| | | |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2019-10-23 17:51:42 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2019-10-23 17:51:42 +0000 |
| commit | 1d5ae1026e831016fc29fd927877c86af904481f (patch) | |
| tree | 2cdfd12620fcfa5d9e4a0389f85368e8e36f63f9 /lib/Target | |
| parent | e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (diff) | |
Diffstat (limited to 'lib/Target')
733 files changed, 38859 insertions, 20100 deletions
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index 6965403a25ab..ac765ebcddc0 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -55,8 +55,9 @@ FunctionPass *createAArch64CollectLOHPass();
 InstructionSelector *
 createAArch64InstructionSelector(const AArch64TargetMachine &,
                                  AArch64Subtarget &, AArch64RegisterBankInfo &);
-FunctionPass *createAArch64PreLegalizeCombiner();
-FunctionPass *createAArch64StackTaggingPass();
+FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone);
+FunctionPass *createAArch64StackTaggingPass(bool MergeInit);
+FunctionPass *createAArch64StackTaggingPreRAPass();

 void initializeAArch64A53Fix835769Pass(PassRegistry&);
 void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
@@ -80,6 +81,7 @@ void initializeFalkorHWPFFixPass(PassRegistry&);
 void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
 void initializeLDTLSCleanupPass(PassRegistry&);
 void initializeAArch64StackTaggingPass(PassRegistry&);
+void initializeAArch64StackTaggingPreRAPass(PassRegistry&);
 } // end namespace llvm

 #endif
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index e39c6995e367..5b4c9e2149da 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -115,11 +115,12 @@ def FeatureSVE2SM4 : SubtargetFeature<"sve2-sm4", "HasSVE2SM4", "true",
 def FeatureSVE2SHA3 : SubtargetFeature<"sve2-sha3", "HasSVE2SHA3", "true",
   "Enable SHA3 SVE2 instructions", [FeatureSVE2, FeatureSHA3]>;

-def FeatureSVE2BitPerm : SubtargetFeature<"bitperm", "HasSVE2BitPerm", "true",
+def FeatureSVE2BitPerm : SubtargetFeature<"sve2-bitperm", "HasSVE2BitPerm", "true",
   "Enable bit permutation SVE2 instructions", [FeatureSVE2]>;

 def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
     "Has zero-cycle register moves">;
+
 def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
     "Has zero-cycle zeroing instructions for generic registers">;
@@ -284,6 +285,10 @@ def FeatureSEL2 : SubtargetFeature<
   "sel2", "HasSEL2", "true",
   "Enable v8.4-A Secure Exception Level 2 extension">;

+def FeaturePMU : SubtargetFeature<
+  "pmu", "HasPMU", "true",
+  "Enable v8.4-A PMU extension">;
+
 def FeatureTLB_RMI : SubtargetFeature<
   "tlb-rmi", "HasTLB_RMI", "true",
   "Enable v8.4-A TLB Range and Maintenance Instructions">;
@@ -345,6 +350,21 @@ def FeatureRandGen : SubtargetFeature<"rand", "HasRandGen",
 def FeatureMTE : SubtargetFeature<"mte", "HasMTE",
     "true", "Enable Memory Tagging Extension" >;

+def FeatureTRBE : SubtargetFeature<"trbe", "HasTRBE",
+    "true", "Enable Trace Buffer Extension">;
+
+def FeatureETE : SubtargetFeature<"ete", "HasETE",
+    "true", "Enable Embedded Trace Extension",
+    [FeatureTRBE]>;
+
+def FeatureTME : SubtargetFeature<"tme", "HasTME",
+    "true", "Enable Transactional Memory Extension" >;
+
+def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
+    "AllowTaggedGlobals",
+    "true", "Use an instruction sequence for taking the address of a global "
+            "that allows a memory tag in the upper address bits">;
+
 //===----------------------------------------------------------------------===//
 // Architectures.
 //
@@ -354,7 +374,7 @@ def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
   FeaturePAN, FeatureLOR, FeatureVH]>;

 def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
-  "Support ARM v8.2a instructions", [HasV8_1aOps, FeaturePsUAO, 
+  "Support ARM v8.2a instructions", [HasV8_1aOps, FeaturePsUAO,
   FeaturePAN_RWV, FeatureRAS, FeatureCCPP]>;

 def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true",
@@ -364,7 +384,7 @@ def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true",
 def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true",
   "Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd,
   FeatureNV, FeatureRASv8_4, FeatureMPAM, FeatureDIT,
-  FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeatureTLB_RMI,
+  FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeaturePMU, FeatureTLB_RMI,
   FeatureFMI, FeatureRCPC_IMMO]>;

 def HasV8_5aOps : SubtargetFeature<
@@ -390,6 +410,7 @@ include "AArch64Schedule.td"
 include "AArch64InstrInfo.td"
 include "AArch64SchedPredicates.td"
 include "AArch64SchedPredExynos.td"
+include "AArch64Combine.td"

 def AArch64InstrInfo : InstrInfo;

@@ -484,6 +505,19 @@ def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
   FeaturePredictableSelectIsExpensive
   ]>;

+def ProcA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65",
+                               "Cortex-A65 ARM processors", [
+                               HasV8_2aOps,
+                               FeatureCrypto,
+                               FeatureDotProd,
+                               FeatureFPARMv8,
+                               FeatureFullFP16,
+                               FeatureNEON,
+                               FeatureRAS,
+                               FeatureRCPC,
+                               FeatureSSBS,
+                               ]>;
+
 def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
                                "Cortex-A72 ARM processors", [
                                FeatureCRC,
@@ -641,6 +675,33 @@ def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
   FeatureSlowSTRQro
   ]>;

+def ProcNeoverseE1 : SubtargetFeature<"neoversee1", "ARMProcFamily",
+                                      "NeoverseE1",
+                                      "Neoverse E1 ARM processors", [
+                                      HasV8_2aOps,
+                                      FeatureCrypto,
+                                      FeatureDotProd,
+                                      FeatureFPARMv8,
+                                      FeatureFullFP16,
+                                      FeatureNEON,
+                                      FeatureRCPC,
+                                      FeatureSSBS,
+                                      ]>;
+
+def ProcNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily",
+                                      "NeoverseN1",
+                                      "Neoverse N1 ARM processors", [
+                                      HasV8_2aOps,
+                                      FeatureCrypto,
+                                      FeatureDotProd,
+                                      FeatureFPARMv8,
+                                      FeatureFullFP16,
+                                      FeatureNEON,
+                                      FeatureRCPC,
+                                      FeatureSPE,
+                                      FeatureSSBS,
+                                      ]>;
+
 def ProcSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
                                    "Qualcomm Saphira processors", [
                                    FeatureCrypto,
@@ -732,19 +793,28 @@ def : ProcessorModel<"generic", NoSchedModel, [
   FeatureFuseAES,
   FeatureNEON,
   FeaturePerfMon,
-  FeaturePostRAScheduler
+  FeaturePostRAScheduler,
+// ETE and TRBE are future architecture extensions. We temporarily enable them
+// by default for users targeting generic AArch64, until it is decided in which
+// armv8.x-a architecture revision they will end up. The extensions do not
+// affect code generated by the compiler and can be used only by explicitly
+// mentioning the new system register names in assembly.
+  FeatureETE
 ]>;

-// FIXME: Cortex-A35 and Cortex-A55 are currently modeled as a Cortex-A53.
def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>; def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; def : ProcessorModel<"cortex-a55", CortexA53Model, [ProcA55]>; def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; +def : ProcessorModel<"cortex-a65", CortexA53Model, [ProcA65]>; +def : ProcessorModel<"cortex-a65ae", CortexA53Model, [ProcA65]>; def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>; def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>; def : ProcessorModel<"cortex-a75", CortexA57Model, [ProcA75]>; def : ProcessorModel<"cortex-a76", CortexA57Model, [ProcA76]>; def : ProcessorModel<"cortex-a76ae", CortexA57Model, [ProcA76]>; +def : ProcessorModel<"neoverse-e1", CortexA53Model, [ProcNeoverseE1]>; +def : ProcessorModel<"neoverse-n1", CortexA57Model, [ProcNeoverseN1]>; def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>; def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>; diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index 92c8c4955d50..13d389cec7a0 100644 --- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -552,7 +552,7 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, std::vector<unsigned> ToErase; for (auto &U : I.operands()) { if (U.isReg() && U.isUse() && Substs.find(U.getReg()) != Substs.end()) { - unsigned OrigReg = U.getReg(); + Register OrigReg = U.getReg(); U.setReg(Substs[OrigReg]); if (U.isKill()) // Don't erase straight away, because there may be other operands @@ -611,12 +611,12 @@ void AArch64A57FPLoadBalancing::scanInstruction( // Create a new chain. Multiplies don't require forwarding so can go on any // unit. - unsigned DestReg = MI->getOperand(0).getReg(); + Register DestReg = MI->getOperand(0).getReg(); LLVM_DEBUG(dbgs() << "New chain started for register " << printReg(DestReg, TRI) << " at " << *MI); - auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg)); + auto G = std::make_unique<Chain>(MI, Idx, getColor(DestReg)); ActiveChains[DestReg] = G.get(); AllChains.push_back(std::move(G)); @@ -624,8 +624,8 @@ void AArch64A57FPLoadBalancing::scanInstruction( // It is beneficial to keep MLAs on the same functional unit as their // accumulator operand. 
-    unsigned DestReg = MI->getOperand(0).getReg();
-    unsigned AccumReg = MI->getOperand(3).getReg();
+    Register DestReg = MI->getOperand(0).getReg();
+    Register AccumReg = MI->getOperand(3).getReg();

     maybeKillChain(MI->getOperand(1), Idx, ActiveChains);
     maybeKillChain(MI->getOperand(2), Idx, ActiveChains);
@@ -661,7 +661,7 @@ void AArch64A57FPLoadBalancing::scanInstruction(
     LLVM_DEBUG(dbgs() << "Creating new chain for dest register "
                       << printReg(DestReg, TRI) << "\n");

-    auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
+    auto G = std::make_unique<Chain>(MI, Idx, getColor(DestReg));
     ActiveChains[DestReg] = G.get();
     AllChains.push_back(std::move(G));

diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index 89404463e1f0..981b366c14b1 100644
--- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
+++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -105,14 +105,14 @@ static bool isGPR64(unsigned Reg, unsigned SubReg,
                     const MachineRegisterInfo *MRI) {
   if (SubReg)
     return false;
-  if (TargetRegisterInfo::isVirtualRegister(Reg))
+  if (Register::isVirtualRegister(Reg))
     return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass);
   return AArch64::GPR64RegClass.contains(Reg);
 }

 static bool isFPR64(unsigned Reg, unsigned SubReg,
                     const MachineRegisterInfo *MRI) {
-  if (TargetRegisterInfo::isVirtualRegister(Reg))
+  if (Register::isVirtualRegister(Reg))
     return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) &&
             SubReg == 0) ||
            (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) &&
@@ -201,8 +201,8 @@ bool AArch64AdvSIMDScalar::isProfitableToTransform(
   unsigned NumNewCopies = 3;
   unsigned NumRemovableCopies = 0;

-  unsigned OrigSrc0 = MI.getOperand(1).getReg();
-  unsigned OrigSrc1 = MI.getOperand(2).getReg();
+  Register OrigSrc0 = MI.getOperand(1).getReg();
+  Register OrigSrc1 = MI.getOperand(2).getReg();
   unsigned SubReg0;
   unsigned SubReg1;
   if (!MRI->def_empty(OrigSrc0)) {
@@ -236,7 +236,7 @@ bool AArch64AdvSIMDScalar::isProfitableToTransform(
   // any of the uses is a transformable instruction, it's likely the transforms
   // will chain, enabling us to save a copy there, too. This is an aggressive
   // heuristic that approximates the graph based cost analysis described above.
-  unsigned Dst = MI.getOperand(0).getReg();
+  Register Dst = MI.getOperand(0).getReg();
   bool AllUsesAreCopies = true;
   for (MachineRegisterInfo::use_instr_nodbg_iterator
            Use = MRI->use_instr_nodbg_begin(Dst),
@@ -293,8 +293,8 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
   assert(OldOpc != NewOpc && "transform an instruction to itself?!");

   // Check if we need a copy for the source registers.
-  unsigned OrigSrc0 = MI.getOperand(1).getReg();
-  unsigned OrigSrc1 = MI.getOperand(2).getReg();
+  Register OrigSrc0 = MI.getOperand(1).getReg();
+  Register OrigSrc1 = MI.getOperand(2).getReg();
   unsigned Src0 = 0, SubReg0;
   unsigned Src1 = 0, SubReg1;
   bool KillSrc0 = false, KillSrc1 = false;
@@ -354,7 +354,7 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
   // Create a vreg for the destination.
   // FIXME: No need to do this if the ultimate user expects an FPR64.
   // Check for that and avoid the copy if possible.
-  unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+  Register Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);

   // For now, all of the new instructions have the same simple three-register
   // form, so no need to special case based on what instruction we're
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 094fbd999523..7ea7915c2ca6 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -99,7 +99,8 @@ public:
   void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
   void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);

-  std::map<std::pair<unsigned, uint32_t>, MCSymbol *> HwasanMemaccessSymbols;
+  typedef std::tuple<unsigned, bool, uint32_t> HwasanMemaccessTuple;
+  std::map<HwasanMemaccessTuple, MCSymbol *> HwasanMemaccessSymbols;
   void LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI);
   void EmitHwasanMemaccessSymbols(Module &M);

@@ -150,7 +151,7 @@ private:
   void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
   bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
   bool printAsmRegInClass(const MachineOperand &MO,
-                          const TargetRegisterClass *RC, bool isVector,
+                          const TargetRegisterClass *RC, unsigned AltName,
                           raw_ostream &O);

   bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
@@ -236,9 +237,12 @@ void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
 }

 void AArch64AsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) {
-  unsigned Reg = MI.getOperand(0).getReg();
+  Register Reg = MI.getOperand(0).getReg();
+  bool IsShort =
+      MI.getOpcode() == AArch64::HWASAN_CHECK_MEMACCESS_SHORTGRANULES;
   uint32_t AccessInfo = MI.getOperand(1).getImm();
-  MCSymbol *&Sym = HwasanMemaccessSymbols[{Reg, AccessInfo}];
+  MCSymbol *&Sym =
+      HwasanMemaccessSymbols[HwasanMemaccessTuple(Reg, IsShort, AccessInfo)];
   if (!Sym) {
     // FIXME: Make this work on non-ELF.
     if (!TM.getTargetTriple().isOSBinFormatELF())
@@ -246,6 +250,8 @@ void AArch64AsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) {

     std::string SymName = "__hwasan_check_x" + utostr(Reg - AArch64::X0) +
                           "_" + utostr(AccessInfo);
+    if (IsShort)
+      SymName += "_short";
     Sym = OutContext.getOrCreateSymbol(SymName);
   }

@@ -263,15 +269,22 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
   std::unique_ptr<MCSubtargetInfo> STI(
       TM.getTarget().createMCSubtargetInfo(TT.str(), "", ""));

-  MCSymbol *HwasanTagMismatchSym =
+  MCSymbol *HwasanTagMismatchV1Sym =
       OutContext.getOrCreateSymbol("__hwasan_tag_mismatch");
+  MCSymbol *HwasanTagMismatchV2Sym =
+      OutContext.getOrCreateSymbol("__hwasan_tag_mismatch_v2");

-  const MCSymbolRefExpr *HwasanTagMismatchRef =
-      MCSymbolRefExpr::create(HwasanTagMismatchSym, OutContext);
+  const MCSymbolRefExpr *HwasanTagMismatchV1Ref =
+      MCSymbolRefExpr::create(HwasanTagMismatchV1Sym, OutContext);
+  const MCSymbolRefExpr *HwasanTagMismatchV2Ref =
+      MCSymbolRefExpr::create(HwasanTagMismatchV2Sym, OutContext);

   for (auto &P : HwasanMemaccessSymbols) {
-    unsigned Reg = P.first.first;
-    uint32_t AccessInfo = P.first.second;
+    unsigned Reg = std::get<0>(P.first);
+    bool IsShort = std::get<1>(P.first);
+    uint32_t AccessInfo = std::get<2>(P.first);
+    const MCSymbolRefExpr *HwasanTagMismatchRef =
+        IsShort ? HwasanTagMismatchV2Ref : HwasanTagMismatchV1Ref;
     MCSymbol *Sym = P.second;

     OutStreamer->SwitchSection(OutContext.getELFSection(
@@ -304,82 +317,86 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
             .addReg(Reg)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
         *STI);
-    MCSymbol *HandlePartialSym = OutContext.createTempSymbol();
+    MCSymbol *HandleMismatchOrPartialSym = OutContext.createTempSymbol();
     OutStreamer->EmitInstruction(
         MCInstBuilder(AArch64::Bcc)
             .addImm(AArch64CC::NE)
-            .addExpr(MCSymbolRefExpr::create(HandlePartialSym, OutContext)),
+            .addExpr(MCSymbolRefExpr::create(HandleMismatchOrPartialSym,
+                                             OutContext)),
         *STI);
     MCSymbol *ReturnSym = OutContext.createTempSymbol();
     OutStreamer->EmitLabel(ReturnSym);
     OutStreamer->EmitInstruction(
         MCInstBuilder(AArch64::RET).addReg(AArch64::LR), *STI);
+    OutStreamer->EmitLabel(HandleMismatchOrPartialSym);

-    OutStreamer->EmitLabel(HandlePartialSym);
-    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWri)
-                                     .addReg(AArch64::WZR)
-                                     .addReg(AArch64::W16)
-                                     .addImm(15)
-                                     .addImm(0),
-                                 *STI);
-    MCSymbol *HandleMismatchSym = OutContext.createTempSymbol();
-    OutStreamer->EmitInstruction(
-        MCInstBuilder(AArch64::Bcc)
-            .addImm(AArch64CC::HI)
-            .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
-        *STI);
+    if (IsShort) {
+      OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWri)
+                                       .addReg(AArch64::WZR)
+                                       .addReg(AArch64::W16)
+                                       .addImm(15)
+                                       .addImm(0),
+                                   *STI);
+      MCSymbol *HandleMismatchSym = OutContext.createTempSymbol();
+      OutStreamer->EmitInstruction(
+          MCInstBuilder(AArch64::Bcc)
+              .addImm(AArch64CC::HI)
+              .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
+          *STI);

-    OutStreamer->EmitInstruction(
-        MCInstBuilder(AArch64::ANDXri)
-            .addReg(AArch64::X17)
-            .addReg(Reg)
-            .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
-        *STI);
-    unsigned Size = 1 << (AccessInfo & 0xf);
-    if (Size != 1)
-      OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ADDXri)
-                                       .addReg(AArch64::X17)
-                                       .addReg(AArch64::X17)
-                                       .addImm(Size - 1)
+      OutStreamer->EmitInstruction(
+          MCInstBuilder(AArch64::ANDXri)
+              .addReg(AArch64::X17)
+              .addReg(Reg)
+              .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
+          *STI);
+      unsigned Size = 1 << (AccessInfo & 0xf);
+      if (Size != 1)
+        OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ADDXri)
+                                         .addReg(AArch64::X17)
+                                         .addReg(AArch64::X17)
+                                         .addImm(Size - 1)
+                                         .addImm(0),
+                                     *STI);
+      OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWrs)
+                                       .addReg(AArch64::WZR)
+                                       .addReg(AArch64::W16)
+                                       .addReg(AArch64::W17)
                                        .addImm(0),
                                    *STI);
-    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWrs)
-                                     .addReg(AArch64::WZR)
-                                     .addReg(AArch64::W16)
-                                     .addReg(AArch64::W17)
-                                     .addImm(0),
-                                 *STI);
-    OutStreamer->EmitInstruction(
-        MCInstBuilder(AArch64::Bcc)
-            .addImm(AArch64CC::LS)
-            .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
-        *STI);
+      OutStreamer->EmitInstruction(
+          MCInstBuilder(AArch64::Bcc)
+              .addImm(AArch64CC::LS)
+              .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
+          *STI);

-    OutStreamer->EmitInstruction(
-        MCInstBuilder(AArch64::ORRXri)
-            .addReg(AArch64::X16)
-            .addReg(Reg)
-            .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
-        *STI);
-    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBui)
-                                     .addReg(AArch64::W16)
-                                     .addReg(AArch64::X16)
-                                     .addImm(0),
-                                 *STI);
-    OutStreamer->EmitInstruction(
-        MCInstBuilder(AArch64::SUBSXrs)
-            .addReg(AArch64::XZR)
-            .addReg(AArch64::X16)
-            .addReg(Reg)
-            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
-        *STI);
-    OutStreamer->EmitInstruction(
-        MCInstBuilder(AArch64::Bcc)
-            .addImm(AArch64CC::EQ)
-            .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)),
-        *STI);
+      OutStreamer->EmitInstruction(
+          MCInstBuilder(AArch64::ORRXri)
+              .addReg(AArch64::X16)
+              .addReg(Reg)
+              .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
+          *STI);
+      OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBui)
+                                       .addReg(AArch64::W16)
+                                       .addReg(AArch64::X16)
+                                       .addImm(0),
+                                   *STI);
+      OutStreamer->EmitInstruction(
+          MCInstBuilder(AArch64::SUBSXrs)
+              .addReg(AArch64::XZR)
+              .addReg(AArch64::X16)
+              .addReg(Reg)
+              .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
+          *STI);
+      OutStreamer->EmitInstruction(
+          MCInstBuilder(AArch64::Bcc)
+              .addImm(AArch64CC::EQ)
+              .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)),
+          *STI);
+
+      OutStreamer->EmitLabel(HandleMismatchSym);
+    }

-    OutStreamer->EmitLabel(HandleMismatchSym);
     OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXpre)
                                      .addReg(AArch64::SP)
                                      .addReg(AArch64::X0)
@@ -414,16 +431,16 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
         MCInstBuilder(AArch64::ADRP)
             .addReg(AArch64::X16)
             .addExpr(AArch64MCExpr::create(
-                HwasanTagMismatchRef,
-                AArch64MCExpr::VariantKind::VK_GOT_PAGE, OutContext)),
+                HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_PAGE,
+                OutContext)),
         *STI);
     OutStreamer->EmitInstruction(
         MCInstBuilder(AArch64::LDRXui)
             .addReg(AArch64::X16)
             .addReg(AArch64::X16)
             .addExpr(AArch64MCExpr::create(
-                HwasanTagMismatchRef,
-                AArch64MCExpr::VariantKind::VK_GOT_LO12, OutContext)),
+                HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_LO12,
+                OutContext)),
         *STI);
     OutStreamer->EmitInstruction(
         MCInstBuilder(AArch64::BR).addReg(AArch64::X16), *STI);
@@ -485,15 +502,14 @@ void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
   default:
     llvm_unreachable("<unknown operand type>");
   case MachineOperand::MO_Register: {
-    unsigned Reg = MO.getReg();
-    assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+    Register Reg = MO.getReg();
+    assert(Register::isPhysicalRegister(Reg));
     assert(!MO.getSubReg() && "Subregs should be eliminated!");
     O << AArch64InstPrinter::getRegisterName(Reg);
     break;
   }
   case MachineOperand::MO_Immediate: {
-    int64_t Imm = MO.getImm();
-    O << '#' << Imm;
+    O << MO.getImm();
     break;
   }
   case MachineOperand::MO_GlobalAddress: {
@@ -510,7 +526,7 @@ void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,

 bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
                                           raw_ostream &O) {
-  unsigned Reg = MO.getReg();
+  Register Reg = MO.getReg();
   switch (Mode) {
   default:
     return true; // Unknown mode.
@@ -531,14 +547,13 @@ bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
 // printing.
 bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
                                            const TargetRegisterClass *RC,
-                                           bool isVector, raw_ostream &O) {
+                                           unsigned AltName, raw_ostream &O) {
   assert(MO.isReg() && "Should only get here with a register!");
   const TargetRegisterInfo *RI = STI->getRegisterInfo();
-  unsigned Reg = MO.getReg();
+  Register Reg = MO.getReg();
   unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
   assert(RI->regsOverlap(RegToPrint, Reg));
-  O << AArch64InstPrinter::getRegisterName(
-           RegToPrint, isVector ? AArch64::vreg : AArch64::NoRegAltName);
+  O << AArch64InstPrinter::getRegisterName(RegToPrint, AltName);
   return false;
 }

@@ -574,6 +589,7 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
       case 's': // Print S register.
       case 'd': // Print D register.
       case 'q': // Print Q register.
+      case 'z': // Print Z register.
         if (MO.isReg()) {
           const TargetRegisterClass *RC;
           switch (ExtraCode[0]) {
@@ -592,10 +608,13 @@
         case 'q':
           RC = &AArch64::FPR128RegClass;
           break;
+        case 'z':
+          RC = &AArch64::ZPRRegClass;
+          break;
         default:
           return true;
         }
-        return printAsmRegInClass(MO, RC, false /* vector */, O);
+        return printAsmRegInClass(MO, RC, AArch64::NoRegAltName, O);
       }
       printOperand(MI, OpNum, O);
       return false;
@@ -605,16 +624,26 @@
   // According to ARM, we should emit x and v registers unless we have a
   // modifier.
   if (MO.isReg()) {
-    unsigned Reg = MO.getReg();
+    Register Reg = MO.getReg();

     // If this is a w or x register, print an x register.
     if (AArch64::GPR32allRegClass.contains(Reg) ||
         AArch64::GPR64allRegClass.contains(Reg))
       return printAsmMRegister(MO, 'x', O);

+    unsigned AltName = AArch64::NoRegAltName;
+    const TargetRegisterClass *RegClass;
+    if (AArch64::ZPRRegClass.contains(Reg)) {
+      RegClass = &AArch64::ZPRRegClass;
+    } else if (AArch64::PPRRegClass.contains(Reg)) {
+      RegClass = &AArch64::PPRRegClass;
+    } else {
+      RegClass = &AArch64::FPR128RegClass;
+      AltName = AArch64::vreg;
+    }
+
     // If this is a b, h, s, d, or q register, print it as a v register.
-    return printAsmRegInClass(MO, &AArch64::FPR128RegClass, true /* vector */,
-                              O);
+    return printAsmRegInClass(MO, RegClass, AltName, O);
   }

   printOperand(MI, OpNum, O);
@@ -682,7 +711,7 @@ void AArch64AsmPrinter::EmitJumpTableInfo() {
     if (JTBBs.empty()) continue;

     unsigned Size = AFI->getJumpTableEntrySize(JTI);
-    EmitAlignment(Log2_32(Size));
+    EmitAlignment(Align(Size));
     OutStreamer->EmitLabel(GetJTISymbol(JTI));

     for (auto *JTBB : JTBBs)
@@ -725,12 +754,12 @@ void AArch64AsmPrinter::emitJumpTableEntry(const MachineJumpTableInfo *MJTI,
 ///     add xDest, xDest, xScratch, lsl #2
 void AArch64AsmPrinter::LowerJumpTableDestSmall(llvm::MCStreamer &OutStreamer,
                                                 const llvm::MachineInstr &MI) {
-  unsigned DestReg = MI.getOperand(0).getReg();
-  unsigned ScratchReg = MI.getOperand(1).getReg();
-  unsigned ScratchRegW =
+  Register DestReg = MI.getOperand(0).getReg();
+  Register ScratchReg = MI.getOperand(1).getReg();
+  Register ScratchRegW =
       STI->getRegisterInfo()->getSubReg(ScratchReg, AArch64::sub_32);
-  unsigned TableReg = MI.getOperand(2).getReg();
-  unsigned EntryReg = MI.getOperand(3).getReg();
+  Register TableReg = MI.getOperand(2).getReg();
+  Register EntryReg = MI.getOperand(3).getReg();
   int JTIdx = MI.getOperand(4).getIndex();
   bool IsByteEntry = MI.getOpcode() == AArch64::JumpTableDest8;

@@ -800,7 +829,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
   if (CallTarget) {
     assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
            "High 16 bits of call target should be zero.");
-    unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
+    Register ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
     EncodedBytes = 16;
     // Materialize the jump address:
     EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZXi)
@@ -830,7 +859,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
 }

 void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) {
-  unsigned DestReg = MI.getOperand(0).getReg();
+  Register DestReg = MI.getOperand(0).getReg();
   if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) {
     // Convert H/S/D register to corresponding Q register
     if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
@@ -894,32 +923,32 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   default:
     break;
   case AArch64::MOVMCSym: {
-      unsigned DestReg = MI->getOperand(0).getReg();
-      const MachineOperand &MO_Sym = MI->getOperand(1);
-      MachineOperand Hi_MOSym(MO_Sym), Lo_MOSym(MO_Sym);
-      MCOperand Hi_MCSym, Lo_MCSym;
+    Register DestReg = MI->getOperand(0).getReg();
+    const MachineOperand &MO_Sym = MI->getOperand(1);
+    MachineOperand Hi_MOSym(MO_Sym), Lo_MOSym(MO_Sym);
+    MCOperand Hi_MCSym, Lo_MCSym;

-      Hi_MOSym.setTargetFlags(AArch64II::MO_G1 | AArch64II::MO_S);
-      Lo_MOSym.setTargetFlags(AArch64II::MO_G0 | AArch64II::MO_NC);
+    Hi_MOSym.setTargetFlags(AArch64II::MO_G1 | AArch64II::MO_S);
+    Lo_MOSym.setTargetFlags(AArch64II::MO_G0 | AArch64II::MO_NC);

-      MCInstLowering.lowerOperand(Hi_MOSym, Hi_MCSym);
-      MCInstLowering.lowerOperand(Lo_MOSym, Lo_MCSym);
+    MCInstLowering.lowerOperand(Hi_MOSym, Hi_MCSym);
+    MCInstLowering.lowerOperand(Lo_MOSym, Lo_MCSym);

-      MCInst MovZ;
-      MovZ.setOpcode(AArch64::MOVZXi);
-      MovZ.addOperand(MCOperand::createReg(DestReg));
-      MovZ.addOperand(Hi_MCSym);
-      MovZ.addOperand(MCOperand::createImm(16));
-      EmitToStreamer(*OutStreamer, MovZ);
+    MCInst MovZ;
+    MovZ.setOpcode(AArch64::MOVZXi);
+    MovZ.addOperand(MCOperand::createReg(DestReg));
+    MovZ.addOperand(Hi_MCSym);
+    MovZ.addOperand(MCOperand::createImm(16));
+    EmitToStreamer(*OutStreamer, MovZ);

-      MCInst MovK;
-      MovK.setOpcode(AArch64::MOVKXi);
-      MovK.addOperand(MCOperand::createReg(DestReg));
-      MovK.addOperand(MCOperand::createReg(DestReg));
-      MovK.addOperand(Lo_MCSym);
-      MovK.addOperand(MCOperand::createImm(0));
-      EmitToStreamer(*OutStreamer, MovK);
-      return;
+    MCInst MovK;
+    MovK.setOpcode(AArch64::MOVKXi);
+    MovK.addOperand(MCOperand::createReg(DestReg));
+    MovK.addOperand(MCOperand::createReg(DestReg));
+    MovK.addOperand(Lo_MCSym);
+    MovK.addOperand(MCOperand::createImm(0));
+    EmitToStreamer(*OutStreamer, MovK);
+    return;
   }
   case AArch64::MOVIv2d_ns:
     // If the target has <rdar://problem/16473581>, lower this
@@ -1084,6 +1113,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
     return;

   case AArch64::HWASAN_CHECK_MEMACCESS:
+  case AArch64::HWASAN_CHECK_MEMACCESS_SHORTGRANULES:
     LowerHWASAN_CHECK_MEMACCESS(*MI);
     return;

@@ -1193,4 +1223,6 @@ extern "C" void LLVMInitializeAArch64AsmPrinter() {
   RegisterAsmPrinter<AArch64AsmPrinter> X(getTheAArch64leTarget());
   RegisterAsmPrinter<AArch64AsmPrinter> Y(getTheAArch64beTarget());
   RegisterAsmPrinter<AArch64AsmPrinter> Z(getTheARM64Target());
+  RegisterAsmPrinter<AArch64AsmPrinter> W(getTheARM64_32Target());
+  RegisterAsmPrinter<AArch64AsmPrinter> V(getTheAArch64_32Target());
 }
diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp
index 59757769c89a..ed93d02aa615 100644
--- a/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -99,7 +99,7 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
   /// (it's an implicit-def of the BL).
   virtual void markPhysRegUsed(unsigned PhysReg) = 0;

-  bool isArgumentHandler() const override { return true; }
+  bool isIncomingArgumentHandler() const override { return true; }

   uint64_t StackUsed;
 };

@@ -110,6 +110,7 @@ struct FormalArgHandler : public IncomingArgHandler {
       : IncomingArgHandler(MIRBuilder, MRI, AssignFn) {}

   void markPhysRegUsed(unsigned PhysReg) override {
+    MIRBuilder.getMRI()->addLiveIn(PhysReg);
     MIRBuilder.getMBB().addLiveIn(PhysReg);
   }
 };
@@ -129,14 +130,29 @@ struct CallReturnHandler : public IncomingArgHandler {
 struct OutgoingArgHandler : public CallLowering::ValueHandler {
   OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                      MachineInstrBuilder MIB, CCAssignFn *AssignFn,
-                     CCAssignFn *AssignFnVarArg)
+                     CCAssignFn *AssignFnVarArg, bool IsTailCall = false,
+                     int FPDiff = 0)
       : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
-        AssignFnVarArg(AssignFnVarArg), StackSize(0) {}
+        AssignFnVarArg(AssignFnVarArg), IsTailCall(IsTailCall), FPDiff(FPDiff),
+        StackSize(0) {}
+
+  bool isIncomingArgumentHandler() const override { return false; }

   Register getStackAddress(uint64_t Size, int64_t Offset,
                            MachinePointerInfo &MPO) override {
+    MachineFunction &MF = MIRBuilder.getMF();
     LLT p0 = LLT::pointer(0, 64);
     LLT s64 = LLT::scalar(64);
+
+    if (IsTailCall) {
+      Offset += FPDiff;
+      int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
+      Register FIReg = MRI.createGenericVirtualRegister(p0);
+      MIRBuilder.buildFrameIndex(FIReg, FI);
+      MPO = MachinePointerInfo::getFixedStack(MF, FI);
+      return FIReg;
+    }
+
     Register SPReg = MRI.createGenericVirtualRegister(p0);
     MIRBuilder.buildCopy(SPReg, Register(AArch64::SP));

@@ -146,7 +162,7 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
     Register AddrReg = MRI.createGenericVirtualRegister(p0);
     MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);

-    MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
+    MPO = MachinePointerInfo::getStack(MF, Offset);
     return AddrReg;
   }

@@ -173,12 +189,13 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
   bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                  CCValAssign::LocInfo LocInfo,
                  const CallLowering::ArgInfo &Info,
+                 ISD::ArgFlagsTy Flags,
                  CCState &State) override {
     bool Res;
     if (Info.IsFixed)
-      Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+      Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
     else
-      Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+      Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Flags, State);

     StackSize = State.getNextStackOffset();
     return Res;
@@ -186,10 +203,19 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {

   MachineInstrBuilder MIB;
   CCAssignFn *AssignFnVarArg;
+  bool IsTailCall;
+
+  /// For tail calls, the byte offset of the call's argument area from the
+  /// callee's. Unused elsewhere.
+  int FPDiff;
   uint64_t StackSize;
 };
 } // namespace

+static bool doesCalleeRestoreStack(CallingConv::ID CallConv, bool TailCallOpt) {
+  return CallConv == CallingConv::Fast && TailCallOpt;
+}
+
 void AArch64CallLowering::splitToValueTypes(
     const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
     const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv) const {
@@ -207,7 +233,7 @@ void AArch64CallLowering::splitToValueTypes(
     // No splitting to do, but we want to replace the original type (e.g. [1 x
     // double] -> double).
     SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
-                           OrigArg.Flags, OrigArg.IsFixed);
+                           OrigArg.Flags[0], OrigArg.IsFixed);
     return;
   }

@@ -218,13 +244,13 @@ void AArch64CallLowering::splitToValueTypes(
       OrigArg.Ty, CallConv, false);
   for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) {
     Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx);
-    SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags,
+    SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags[0],
                            OrigArg.IsFixed);
     if (NeedsRegBlock)
-      SplitArgs.back().Flags.setInConsecutiveRegs();
+      SplitArgs.back().Flags[0].setInConsecutiveRegs();
   }

-  SplitArgs.back().Flags.setInConsecutiveRegsLast();
+  SplitArgs.back().Flags[0].setInConsecutiveRegsLast();
 }

 bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
@@ -344,6 +370,49 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
   return Success;
 }

+/// Helper function to compute forwarded registers for musttail calls. Computes
+/// the forwarded registers, sets MBB liveness, and emits COPY instructions that
+/// can be used to save + restore registers later.
+static void handleMustTailForwardedRegisters(MachineIRBuilder &MIRBuilder,
+                                             CCAssignFn *AssignFn) {
+  MachineBasicBlock &MBB = MIRBuilder.getMBB();
+  MachineFunction &MF = MIRBuilder.getMF();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  if (!MFI.hasMustTailInVarArgFunc())
+    return;
+
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  const Function &F = MF.getFunction();
+  assert(F.isVarArg() && "Expected F to be vararg?");
+
+  // Compute the set of forwarded registers. The rest are scratch.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(F.getCallingConv(), /*IsVarArg=*/true, MF, ArgLocs,
+                 F.getContext());
+  SmallVector<MVT, 2> RegParmTypes;
+  RegParmTypes.push_back(MVT::i64);
+  RegParmTypes.push_back(MVT::f128);
+
+  // Later on, we can use this vector to restore the registers if necessary.
+  SmallVectorImpl<ForwardedRegister> &Forwards =
+      FuncInfo->getForwardedMustTailRegParms();
+  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, AssignFn);
+
+  // Conservatively forward X8, since it might be used for an aggregate
+  // return.
+  if (!CCInfo.isAllocated(AArch64::X8)) {
+    unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
+    Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
+  }
+
+  // Add the forwards to the MachineBasicBlock and MachineFunction.
+  for (const auto &F : Forwards) {
+    MBB.addLiveIn(F.PReg);
+    MIRBuilder.buildCopy(Register(F.VReg), Register(F.PReg));
+  }
+}
+
 bool AArch64CallLowering::lowerFormalArguments(
     MachineIRBuilder &MIRBuilder, const Function &F,
     ArrayRef<ArrayRef<Register>> VRegs) const {
@@ -376,64 +445,530 @@ bool AArch64CallLowering::lowerFormalArguments(
   if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
     return false;

+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  uint64_t StackOffset = Handler.StackUsed;
   if (F.isVarArg()) {
-    if (!MF.getSubtarget<AArch64Subtarget>().isTargetDarwin()) {
-      // FIXME: we need to reimplement saveVarArgsRegisters from
+    auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+    if (!Subtarget.isTargetDarwin()) {
+      // FIXME: we need to reimplement saveVarArgsRegisters from
       // AArch64ISelLowering.
       return false;
     }

-    // We currently pass all varargs at 8-byte alignment.
-    uint64_t StackOffset = alignTo(Handler.StackUsed, 8);
+    // We currently pass all varargs at 8-byte alignment, or 4 in ILP32.
+    StackOffset = alignTo(Handler.StackUsed, Subtarget.isTargetILP32() ? 4 : 8);

     auto &MFI = MIRBuilder.getMF().getFrameInfo();
-    AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
     FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
   }

+  if (doesCalleeRestoreStack(F.getCallingConv(),
+                             MF.getTarget().Options.GuaranteedTailCallOpt)) {
+    // We have a non-standard ABI, so why not make full use of the stack that
+    // we're going to pop? It must be aligned to 16 B in any case.
+    StackOffset = alignTo(StackOffset, 16);
+
+    // If we're expected to restore the stack (e.g. fastcc), then we'll be
+    // adding a multiple of 16.
+    FuncInfo->setArgumentStackToRestore(StackOffset);
+
+    // Our own callers will guarantee that the space is free by giving an
+    // aligned value to CALLSEQ_START.
+  }
+
+  // When we tail call, we need to check if the callee's arguments
+  // will fit on the caller's stack. So, whenever we lower formal arguments,
+  // we should keep track of this information, since we might lower a tail call
+  // in this function later.
+  FuncInfo->setBytesInStackArgArea(StackOffset);
+
   auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   if (Subtarget.hasCustomCallingConv())
     Subtarget.getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);

+  handleMustTailForwardedRegisters(MIRBuilder, AssignFn);
+
   // Move back to the end of the basic block.
   MIRBuilder.setMBB(MBB);

   return true;
 }

+/// Return true if the calling convention is one that we can guarantee TCO for.
+static bool canGuaranteeTCO(CallingConv::ID CC) {
+  return CC == CallingConv::Fast;
+}
+
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+  switch (CC) {
+  case CallingConv::C:
+  case CallingConv::PreserveMost:
+  case CallingConv::Swift:
+    return true;
+  default:
+    return canGuaranteeTCO(CC);
+  }
+}
+
+/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for
+/// CC.
+static std::pair<CCAssignFn *, CCAssignFn *>
+getAssignFnsForCC(CallingConv::ID CC, const AArch64TargetLowering &TLI) {
+  return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
+}
+
+bool AArch64CallLowering::doCallerAndCalleePassArgsTheSameWay(
+    CallLoweringInfo &Info, MachineFunction &MF,
+    SmallVectorImpl<ArgInfo> &InArgs) const {
+  const Function &CallerF = MF.getFunction();
+  CallingConv::ID CalleeCC = Info.CallConv;
+  CallingConv::ID CallerCC = CallerF.getCallingConv();
+
+  // If the calling conventions match, then everything must be the same.
+  if (CalleeCC == CallerCC)
+    return true;
+
+  // Check if the caller and callee will handle arguments in the same way.
+  const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+  CCAssignFn *CalleeAssignFnFixed;
+  CCAssignFn *CalleeAssignFnVarArg;
+  std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) =
+      getAssignFnsForCC(CalleeCC, TLI);
+
+  CCAssignFn *CallerAssignFnFixed;
+  CCAssignFn *CallerAssignFnVarArg;
+  std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) =
+      getAssignFnsForCC(CallerCC, TLI);
+
+  if (!resultsCompatible(Info, MF, InArgs, *CalleeAssignFnFixed,
+                         *CalleeAssignFnVarArg, *CallerAssignFnFixed,
+                         *CallerAssignFnVarArg))
+    return false;
+
+  // Make sure that the caller and callee preserve all of the same registers.
+  auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
+  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+  if (MF.getSubtarget<AArch64Subtarget>().hasCustomCallingConv()) {
+    TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
+    TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
+  }
+
+  return TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved);
+}
+
+bool AArch64CallLowering::areCalleeOutgoingArgsTailCallable(
+    CallLoweringInfo &Info, MachineFunction &MF,
+    SmallVectorImpl<ArgInfo> &OutArgs) const {
+  // If there are no outgoing arguments, then we are done.
+  if (OutArgs.empty())
+    return true;
+
+  const Function &CallerF = MF.getFunction();
+  CallingConv::ID CalleeCC = Info.CallConv;
+  CallingConv::ID CallerCC = CallerF.getCallingConv();
+  const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+
+  CCAssignFn *AssignFnFixed;
+  CCAssignFn *AssignFnVarArg;
+  std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
+
+  // We have outgoing arguments. Make sure that we can tail call with them.
+  SmallVector<CCValAssign, 16> OutLocs;
+  CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
+
+  if (!analyzeArgInfo(OutInfo, OutArgs, *AssignFnFixed, *AssignFnVarArg)) {
+    LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
+    return false;
+  }
+
+  // Make sure that they can fit on the caller's stack.
+  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  if (OutInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) {
+    LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
+    return false;
+  }
+
+  // Verify that the parameters in callee-saved registers match.
+  // TODO: Port this over to CallLowering as general code once swiftself is
+  // supported.
+  auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
+  const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  for (unsigned i = 0; i < OutLocs.size(); ++i) {
+    auto &ArgLoc = OutLocs[i];
+    // If it's not a register, it's fine.
+    if (!ArgLoc.isRegLoc()) {
+      if (Info.IsVarArg) {
+        // Be conservative and disallow variadic memory operands to match SDAG's
+        // behaviour.
+        // FIXME: If the caller's calling convention is C, then we can
+        // potentially use its argument area. However, for cases like fastcc,
+        // we can't do anything.
+        LLVM_DEBUG(
+            dbgs()
+            << "... Cannot tail call vararg function with stack arguments\n");
+        return false;
+      }
+      continue;
+    }
+
+    Register Reg = ArgLoc.getLocReg();
+
+    // Only look at callee-saved registers.
+    if (MachineOperand::clobbersPhysReg(CallerPreservedMask, Reg))
+      continue;
+
+    LLVM_DEBUG(
+        dbgs()
+        << "... Call has an argument passed in a callee-saved register.\n");
+
+    // Check if it was copied from.
+    ArgInfo &OutInfo = OutArgs[i];
+
+    if (OutInfo.Regs.size() > 1) {
+      LLVM_DEBUG(
+          dbgs() << "... Cannot handle arguments in multiple registers.\n");
+      return false;
+    }
+
+    // Check if we copy the register, walking through copies from virtual
+    // registers. Note that getDefIgnoringCopies does not ignore copies from
+    // physical registers.
+    MachineInstr *RegDef = getDefIgnoringCopies(OutInfo.Regs[0], MRI);
+    if (!RegDef || RegDef->getOpcode() != TargetOpcode::COPY) {
+      LLVM_DEBUG(
+          dbgs()
+          << "... Parameter was not copied into a VReg, cannot tail call.\n");
+      return false;
+    }
+
+    // Got a copy. Verify that it's the same as the register we want.
+    Register CopyRHS = RegDef->getOperand(1).getReg();
+    if (CopyRHS != Reg) {
+      LLVM_DEBUG(dbgs() << "... Callee-saved register was not copied into "
+                           "VReg, cannot tail call.\n");
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool AArch64CallLowering::isEligibleForTailCallOptimization(
+    MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
+    SmallVectorImpl<ArgInfo> &InArgs,
+    SmallVectorImpl<ArgInfo> &OutArgs) const {
+
+  // Must pass all target-independent checks in order to tail call optimize.
+  if (!Info.IsTailCall)
+    return false;
+
+  CallingConv::ID CalleeCC = Info.CallConv;
+  MachineFunction &MF = MIRBuilder.getMF();
+  const Function &CallerF = MF.getFunction();
+
+  LLVM_DEBUG(dbgs() << "Attempting to lower call as tail call\n");
+
+  if (Info.SwiftErrorVReg) {
+    // TODO: We should handle this.
+    // Note that this is also handled by the check for no outgoing arguments.
+    // Proactively disabling this though, because the swifterror handling in
+    // lowerCall inserts a COPY *after* the location of the call.
+    LLVM_DEBUG(dbgs() << "... Cannot handle tail calls with swifterror yet.\n");
+    return false;
+  }
+
+  if (!mayTailCallThisCC(CalleeCC)) {
+    LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n");
+    return false;
+  }
+
+  // Byval parameters hand the function a pointer directly into the stack area
+  // we want to reuse during a tail call. Working around this *is* possible (see
+  // X86).
+  //
+  // FIXME: In AArch64ISelLowering, this isn't worked around. Can/should we try
+  // it?
+  //
+  // On Windows, "inreg" attributes signify non-aggregate indirect returns.
+  // In this case, it is necessary to save/restore X0 in the callee. Tail
+  // call opt interferes with this. So we disable tail call opt when the
+  // caller has an argument with "inreg" attribute.
+  //
+  // FIXME: Check whether the callee also has an "inreg" argument.
+  //
+  // When the caller has a swifterror argument, we don't want to tail call
+  // because we would have to move into the swifterror register before the
+  // tail call.
+  if (any_of(CallerF.args(), [](const Argument &A) {
+        return A.hasByValAttr() || A.hasInRegAttr() || A.hasSwiftErrorAttr();
+      })) {
+    LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval, "
+                         "inreg, or swifterror arguments\n");
+    return false;
+  }
+
+  // Externally-defined functions with weak linkage should not be
+  // tail-called on AArch64 when the OS does not support dynamic
+  // pre-emption of symbols, as the AAELF spec requires normal calls
+  // to undefined weak functions to be replaced with a NOP or jump to the
+  // next instruction. The behaviour of branch instructions in this
+  // situation (as used for tail calls) is implementation-defined, so we
+  // cannot rely on the linker replacing the tail call with a return.
+  if (Info.Callee.isGlobal()) {
+    const GlobalValue *GV = Info.Callee.getGlobal();
+    const Triple &TT = MF.getTarget().getTargetTriple();
+    if (GV->hasExternalWeakLinkage() &&
+        (!TT.isOSWindows() || TT.isOSBinFormatELF() ||
+         TT.isOSBinFormatMachO())) {
+      LLVM_DEBUG(dbgs() << "... Cannot tail call externally-defined function "
+                           "with weak linkage for this OS.\n");
+      return false;
+    }
+  }
+
+  // If we have -tailcallopt, then we're done.
+  if (MF.getTarget().Options.GuaranteedTailCallOpt)
+    return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv();
+
+  // We don't have -tailcallopt, so we're allowed to change the ABI (sibcall).
+  // Try to find cases where we can do that.
+
+  // I want anyone implementing a new calling convention to think long and hard
+  // about this assert.
+  assert((!Info.IsVarArg || CalleeCC == CallingConv::C) &&
+         "Unexpected variadic calling convention");
+
+  // Verify that the incoming and outgoing arguments from the callee are
+  // safe to tail call.
+  if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
+    LLVM_DEBUG(
+        dbgs()
+        << "... Caller and callee have incompatible calling conventions.\n");
+    return false;
+  }
+
+  if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
+    return false;
+
+  LLVM_DEBUG(
+      dbgs() << "... Call is eligible for tail call optimization.\n");
+  return true;
+}
+
+static unsigned getCallOpcode(const Function &CallerF, bool IsIndirect,
+                              bool IsTailCall) {
+  if (!IsTailCall)
+    return IsIndirect ? AArch64::BLR : AArch64::BL;
+
+  if (!IsIndirect)
+    return AArch64::TCRETURNdi;
+
+  // When BTI is enabled, we need to use TCRETURNriBTI to make sure that we use
+  // x16 or x17.
+  if (CallerF.hasFnAttribute("branch-target-enforcement"))
+    return AArch64::TCRETURNriBTI;
+
+  return AArch64::TCRETURNri;
+}
+
+bool AArch64CallLowering::lowerTailCall(
+    MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
+    SmallVectorImpl<ArgInfo> &OutArgs) const {
+  MachineFunction &MF = MIRBuilder.getMF();
+  const Function &F = MF.getFunction();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+
+  // True when we're tail calling, but without -tailcallopt.
+  bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;
+
+  // TODO: Right now, regbankselect doesn't know how to handle the rtcGPR64
+  // register class. Until we can do that, we should fall back here.
+  if (F.hasFnAttribute("branch-target-enforcement")) {
+    LLVM_DEBUG(
+        dbgs() << "Cannot lower indirect tail calls with BTI enabled yet.\n");
+    return false;
+  }
+
+  // Find out which ABI gets to decide where things go.
+  CallingConv::ID CalleeCC = Info.CallConv;
+  CCAssignFn *AssignFnFixed;
+  CCAssignFn *AssignFnVarArg;
+  std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
+
+  MachineInstrBuilder CallSeqStart;
+  if (!IsSibCall)
+    CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);
+
+  unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), true);
+  auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
+  MIB.add(Info.Callee);
+
+  // Byte offset for the tail call. When we are sibcalling, this will always
+  // be 0.
+  MIB.addImm(0);
+
+  // Tell the call which registers are clobbered.
+  auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(MF, F.getCallingConv());
+  if (MF.getSubtarget<AArch64Subtarget>().hasCustomCallingConv())
+    TRI->UpdateCustomCallPreservedMask(MF, &Mask);
+  MIB.addRegMask(Mask);
+
+  if (TRI->isAnyArgRegReserved(MF))
+    TRI->emitReservedArgRegCallError(MF);
+
+  // FPDiff is the byte offset of the call's argument area from the callee's.
+  // Stores to callee stack arguments will be placed in FixedStackSlots offset
+  // by this amount for a tail call. In a sibling call it must be 0 because the
+  // caller will deallocate the entire stack and the callee still expects its
+  // arguments to begin at SP+0.
+  int FPDiff = 0;
+
+  // This will be 0 for sibcalls, potentially nonzero for tail calls produced
+  // by -tailcallopt. For sibcalls, the memory operands for the call are
+  // already available in the caller's incoming argument space.
+  unsigned NumBytes = 0;
+  if (!IsSibCall) {
+    // We aren't sibcalling, so we need to compute FPDiff. We need to do this
+    // before handling assignments, because FPDiff must be known for memory
+    // arguments.
+    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
+    SmallVector<CCValAssign, 16> OutLocs;
+    CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());
+    analyzeArgInfo(OutInfo, OutArgs, *AssignFnFixed, *AssignFnVarArg);
+
+    // The callee will pop the argument stack as a tail call. Thus, we must
+    // keep it 16-byte aligned.
+    NumBytes = alignTo(OutInfo.getNextStackOffset(), 16);
+
+    // FPDiff will be negative if this tail call requires more space than we
+    // would automatically have in our incoming argument space. Positive if we
+    // actually shrink the stack.
+    FPDiff = NumReusableBytes - NumBytes;
+
+    // The stack pointer must be 16-byte aligned at all times it's used for a
+    // memory operation, which in practice means at *all* times and in
+    // particular across call boundaries. Therefore our own arguments started at
+    // a 16-byte aligned SP and the delta applied for the tail call should
+    // satisfy the same constraint.
+    assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
+  }
+
+  const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
+
+  // Do the actual argument marshalling.
+  SmallVector<unsigned, 8> PhysRegs;
+  OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
+                             AssignFnVarArg, true, FPDiff);
+  if (!handleAssignments(MIRBuilder, OutArgs, Handler))
+    return false;
+
+  if (Info.IsVarArg && Info.IsMustTailCall) {
+    // Now we know what's being passed to the function. Add uses to the call for
+    // the forwarded registers that we *aren't* passing as parameters. This will
+    // preserve the copies we build earlier.
+    for (const auto &F : Forwards) {
+      Register ForwardedReg = F.PReg;
+      // If the register is already passed, or aliases a register which is
+      // already being passed, then skip it.
+      if (any_of(MIB->uses(), [&ForwardedReg, &TRI](const MachineOperand &Use) {
+            if (!Use.isReg())
+              return false;
+            return TRI->regsOverlap(Use.getReg(), ForwardedReg);
+          }))
+        continue;
+
+      // We aren't passing it already, so we should add it to the call.
+      MIRBuilder.buildCopy(ForwardedReg, Register(F.VReg));
+      MIB.addReg(ForwardedReg, RegState::Implicit);
+    }
+  }
+
+  // If we have -tailcallopt, we need to adjust the stack. We'll do the call
+  // sequence start and end here.
+  if (!IsSibCall) {
+    MIB->getOperand(1).setImm(FPDiff);
+    CallSeqStart.addImm(NumBytes).addImm(0);
+    // End the call sequence *before* emitting the call. Normally, we would
+    // tidy the frame up after the call. However, here, we've laid out the
+    // parameters so that when SP is reset, they will be in the correct
+    // location.
+    MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP).addImm(NumBytes).addImm(0);
+  }
+
+  // Now we can add the actual call instruction to the correct basic block.
+  MIRBuilder.insertInstr(MIB);
+
+  // If Callee is a reg, since it is used by a target specific instruction,
+  // it must have a register class matching the constraint of that instruction.
+  if (Info.Callee.isReg())
+    MIB->getOperand(0).setReg(constrainOperandRegClass(
+        MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
+        *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee,
+        0));
+
+  MF.getFrameInfo().setHasTailCall();
+  Info.LoweredTailCall = true;
+  return true;
+}
+
 bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
-                                    CallingConv::ID CallConv,
-                                    const MachineOperand &Callee,
-                                    const ArgInfo &OrigRet,
-                                    ArrayRef<ArgInfo> OrigArgs,
-                                    Register SwiftErrorVReg) const {
+                                    CallLoweringInfo &Info) const {
   MachineFunction &MF = MIRBuilder.getMF();
   const Function &F = MF.getFunction();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   auto &DL = F.getParent()->getDataLayout();
+  const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();

-  SmallVector<ArgInfo, 8> SplitArgs;
-  for (auto &OrigArg : OrigArgs) {
-    splitToValueTypes(OrigArg, SplitArgs, DL, MRI, CallConv);
+  SmallVector<ArgInfo, 8> OutArgs;
+  for (auto &OrigArg : Info.OrigArgs) {
+    splitToValueTypes(OrigArg, OutArgs, DL, MRI, Info.CallConv);
     // AAPCS requires that we zero-extend i1 to 8 bits by the caller.
     if (OrigArg.Ty->isIntegerTy(1))
-      SplitArgs.back().Flags.setZExt();
+      OutArgs.back().Flags[0].setZExt();
   }

+  SmallVector<ArgInfo, 8> InArgs;
+  if (!Info.OrigRet.Ty->isVoidTy())
+    splitToValueTypes(Info.OrigRet, InArgs, DL, MRI, F.getCallingConv());
+
+  // If we can lower as a tail call, do that instead.
+  bool CanTailCallOpt =
+      isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs);
+
+  // We must emit a tail call if we have musttail.
+  if (Info.IsMustTailCall && !CanTailCallOpt) {
+    // There are types of incoming/outgoing arguments we can't handle yet, so
+    // it doesn't make sense to actually die here like in ISelLowering. Instead,
+    // fall back to SelectionDAG and let it try to handle this.
+    LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
+    return false;
+  }
+
+  if (CanTailCallOpt)
+    return lowerTailCall(MIRBuilder, Info, OutArgs);
+
   // Find out which ABI gets to decide where things go.
-  const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
-  CCAssignFn *AssignFnFixed =
-      TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
-  CCAssignFn *AssignFnVarArg =
-      TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/true);
+  CCAssignFn *AssignFnFixed;
+  CCAssignFn *AssignFnVarArg;
+  std::tie(AssignFnFixed, AssignFnVarArg) =
+      getAssignFnsForCC(Info.CallConv, TLI);

-  auto CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);
+  MachineInstrBuilder CallSeqStart;
+  CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);

   // Create a temporarily-floating call instruction so we can add the implicit
   // uses of arg registers.
-  auto MIB = MIRBuilder.buildInstrNoInsert(Callee.isReg() ? AArch64::BLR
-                                                          : AArch64::BL);
-  MIB.add(Callee);
+  unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), false);
+
+  auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
+  MIB.add(Info.Callee);

   // Tell the call which registers are clobbered.
   auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
@@ -448,8 +983,8 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,

   // Do the actual argument marshalling.
   SmallVector<unsigned, 8> PhysRegs;
   OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
-                             AssignFnVarArg);
-  if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+                             AssignFnVarArg, false);
+  if (!handleAssignments(MIRBuilder, OutArgs, Handler))
     return false;

   // Now we can add the actual call instruction to the correct basic block.
@@ -458,34 +993,37 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
   // If Callee is a reg, since it is used by a target specific
   // instruction, it must have a register class matching the
   // constraint of that instruction.
-  if (Callee.isReg())
+  if (Info.Callee.isReg())
     MIB->getOperand(0).setReg(constrainOperandRegClass(
         MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
-        *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Callee, 0));
+        *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee,
+        0));

   // Finally we can copy the returned value back into its virtual-register. In
   // symmetry with the arguments, the physical register must be an
   // implicit-define of the call instruction.
-  CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(F.getCallingConv());
-  if (!OrigRet.Ty->isVoidTy()) {
-    SplitArgs.clear();
-
-    splitToValueTypes(OrigRet, SplitArgs, DL, MRI, F.getCallingConv());
-
+  if (!Info.OrigRet.Ty->isVoidTy()) {
+    CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(F.getCallingConv());
     CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
-    if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+    if (!handleAssignments(MIRBuilder, InArgs, Handler))
       return false;
   }

-  if (SwiftErrorVReg) {
+  if (Info.SwiftErrorVReg) {
     MIB.addDef(AArch64::X21, RegState::Implicit);
-    MIRBuilder.buildCopy(SwiftErrorVReg, Register(AArch64::X21));
+    MIRBuilder.buildCopy(Info.SwiftErrorVReg, Register(AArch64::X21));
   }

+  uint64_t CalleePopBytes =
+      doesCalleeRestoreStack(Info.CallConv,
+                             MF.getTarget().Options.GuaranteedTailCallOpt)
+          ? alignTo(Handler.StackSize, 16)
+          : 0;
+
   CallSeqStart.addImm(Handler.StackSize).addImm(0);
   MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP)
       .addImm(Handler.StackSize)
-      .addImm(0);
+      .addImm(CalleePopBytes);

   return true;
 }
diff --git a/lib/Target/AArch64/AArch64CallLowering.h b/lib/Target/AArch64/AArch64CallLowering.h
index 4f428f254537..b0c601c7062c 100644
--- a/lib/Target/AArch64/AArch64CallLowering.h
+++ b/lib/Target/AArch64/AArch64CallLowering.h
@@ -40,16 +40,15 @@ public:
   bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
                             ArrayRef<ArrayRef<Register>> VRegs) const override;

-  bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
-                 const MachineOperand &Callee, const ArgInfo &OrigRet,
-                 ArrayRef<ArgInfo> OrigArgs,
-                 Register SwiftErrorVReg) const override;
+  bool lowerCall(MachineIRBuilder &MIRBuilder,
+                 CallLoweringInfo &Info) const override;

-  bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
-                 const MachineOperand &Callee, const ArgInfo &OrigRet,
-                 ArrayRef<ArgInfo> OrigArgs) const override {
-    return lowerCall(MIRBuilder, CallConv, Callee, OrigRet, OrigArgs, 0);
-  }
+  /// Returns true if the call can be lowered as a tail call.
+ bool + isEligibleForTailCallOptimization(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info, + SmallVectorImpl<ArgInfo> &InArgs, + SmallVectorImpl<ArgInfo> &OutArgs) const; bool supportSwiftError() const override { return true; } @@ -64,6 +63,18 @@ private: SmallVectorImpl<ArgInfo> &SplitArgs, const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv) const; + + bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, + SmallVectorImpl<ArgInfo> &OutArgs) const; + + bool + doCallerAndCalleePassArgsTheSameWay(CallLoweringInfo &Info, + MachineFunction &MF, + SmallVectorImpl<ArgInfo> &InArgs) const; + + bool + areCalleeOutgoingArgsTailCallable(CallLoweringInfo &Info, MachineFunction &MF, + SmallVectorImpl<ArgInfo> &OutArgs) const; }; } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64CallingConvention.cpp b/lib/Target/AArch64/AArch64CallingConvention.cpp index 02538a187611..a0695cef615f 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.cpp +++ b/lib/Target/AArch64/AArch64CallingConvention.cpp @@ -40,12 +40,14 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers, MVT LocVT, ISD::ArgFlagsTy &ArgFlags, CCState &State, unsigned SlotAlign) { unsigned Size = LocVT.getSizeInBits() / 8; - unsigned StackAlign = + const Align StackAlign = State.getMachineFunction().getDataLayout().getStackAlignment(); - unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign); + const Align OrigAlign(ArgFlags.getOrigAlign()); + const Align Align = std::min(OrigAlign, StackAlign); for (auto &It : PendingMembers) { - It.convertToMem(State.AllocateStack(Size, std::max(Align, SlotAlign))); + It.convertToMem(State.AllocateStack( + Size, std::max((unsigned)Align.value(), SlotAlign))); State.addLoc(It); SlotAlign = 1; } @@ -79,10 +81,14 @@ static bool CC_AArch64_Custom_Stack_Block( static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { + const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>( + State.getMachineFunction().getSubtarget()); + bool IsDarwinILP32 = Subtarget.isTargetILP32() && Subtarget.isTargetMachO(); + // Try to allocate a contiguous block of registers, each of the correct // size to hold one member. ArrayRef<MCPhysReg> RegList; - if (LocVT.SimpleTy == MVT::i64) + if (LocVT.SimpleTy == MVT::i64 || (IsDarwinILP32 && LocVT.SimpleTy == MVT::i32)) RegList = XRegList; else if (LocVT.SimpleTy == MVT::f16) RegList = HRegList; @@ -107,8 +113,12 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, if (!ArgFlags.isInConsecutiveRegsLast()) return true; - unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size()); - if (RegResult) { + // [N x i32] arguments get packed into x-registers on Darwin's arm64_32 + // because that's how the armv7k Clang front-end emits small structs. + unsigned EltsPerReg = (IsDarwinILP32 && LocVT.SimpleTy == MVT::i32) ? 
2 : 1; + unsigned RegResult = State.AllocateRegBlock( + RegList, alignTo(PendingMembers.size(), EltsPerReg) / EltsPerReg); + if (RegResult && EltsPerReg == 1) { for (auto &It : PendingMembers) { It.convertToReg(RegResult); State.addLoc(It); @@ -116,14 +126,26 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, } PendingMembers.clear(); return true; + } else if (RegResult) { + assert(EltsPerReg == 2 && "unexpected ABI"); + bool UseHigh = false; + CCValAssign::LocInfo Info; + for (auto &It : PendingMembers) { + Info = UseHigh ? CCValAssign::AExtUpper : CCValAssign::ZExt; + State.addLoc(CCValAssign::getReg(It.getValNo(), MVT::i32, RegResult, + MVT::i64, Info)); + UseHigh = !UseHigh; + if (!UseHigh) + ++RegResult; + } + PendingMembers.clear(); + return true; } // Mark all regs in the class as unavailable for (auto Reg : RegList) State.AllocateReg(Reg); - const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>( - State.getMachineFunction().getSubtarget()); unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8; return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign); diff --git a/lib/Target/AArch64/AArch64CallingConvention.h b/lib/Target/AArch64/AArch64CallingConvention.h index 13cc0c583fd2..5a55d090d7c8 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.h +++ b/lib/Target/AArch64/AArch64CallingConvention.h @@ -25,6 +25,9 @@ bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State); +bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State); diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index d969a9e1ab3a..bccbbd4591ed 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -17,6 +17,10 @@ class CCIfAlign<string Align, CCAction A> : class CCIfBigEndian<CCAction A> : CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>; +class CCIfILP32<CCAction A> : + CCIf<"State.getMachineFunction().getDataLayout().getPointerSize() == 4", A>; + + //===----------------------------------------------------------------------===// // ARM AAPCS64 Calling Convention //===----------------------------------------------------------------------===// @@ -70,6 +74,18 @@ def CC_AArch64_AAPCS : CallingConv<[ CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>, + CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, + nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64], + CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>, + CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, + nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64], + CCPassIndirect<i64>>, + + CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCAssignToReg<[P0, P1, P2, P3]>>, + CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCPassIndirect<i64>>, + // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, // up to eight each of GPR and FPR. 
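The EltsPerReg == 2 packing just above is easiest to see with a concrete aggregate; a hedged sketch (struct `S` and `takes_s` are illustrative only, register contents shown in comments):

    // arm64_32 packing of a [3 x i32] block: ceil(3/2) = 2 X registers are
    // allocated, halves alternating ZExt (low 32 bits) / AExtUpper (high):
    //     x0 = ((unsigned long long)s.b << 32) | (unsigned)s.a
    //     x1 = (unsigned)s.c        // upper half is anyext/unspecified
    struct S { int a, b, c; };
    extern void takes_s(S s);  // illustrative callee

    void demo() { takes_s(S{1, 2, 3}); }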
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, @@ -111,6 +127,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[ CCIfType<[v2f32], CCBitConvertToType<v2i32>>, CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>, + CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>, CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X21], [W21]>>>, // Big endian vectors must be passed as if they were 1-element vectors so that @@ -135,7 +152,14 @@ def RetCC_AArch64_AAPCS : CallingConv<[ CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], - CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> + CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + + CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, + nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64], + CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>, + + CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCAssignToReg<[P0, P1, P2, P3]>> ]>; // Vararg functions on windows pass floats in integer registers @@ -202,6 +226,12 @@ def CC_AArch64_DarwinPCS : CallingConv<[ CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>, CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>, CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + + // Re-demote pointers to 32-bits so we don't end up storing 64-bit + // values and clobbering neighbouring stack locations. Not very pretty. + CCIfPtr<CCIfILP32<CCTruncToType<i32>>>, + CCIfPtr<CCIfILP32<CCAssignToStack<4, 4>>>, + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16], CCAssignToStack<8, 8>>, CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], @@ -229,6 +259,29 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ CCAssignToStack<16, 16>> ]>; +// In the ILP32 world, the minimum stack slot size is 4 bytes. Otherwise the +// same as the normal Darwin VarArgs handling. +let Entry = 1 in +def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType<v2i32>>, + CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>, + + // Handle all scalar types as either i32 or f32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + CCIfType<[f16], CCPromoteToType<f32>>, + + // Everything is on the stack. + // i128 is split to two i64s, and its stack alignment is 16 bytes. + CCIfPtr<CCIfILP32<CCTruncToType<i32>>>, + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>, + CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], + CCAssignToStack<8, 8>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCAssignToStack<16, 16>> +]>; + + // The WebKit_JS calling convention only passes the first argument (the callee) // in register and the remaining arguments on stack. We allow 32bit stack slots, // so that WebKit can write partial values in the stack and define the other @@ -298,6 +351,12 @@ def CC_AArch64_GHC : CallingConv<[ CCIfType<[i64], CCAssignToReg<[X19, X20, X21, X22, X23, X24, X25, X26, X27, X28]>> ]>; +// The order of the callee-saves in this file is important, because the +// FrameLowering code will use this order to determine the layout the +// callee-save area in the stack frame. As can be observed below, Darwin +// requires the frame-record (LR, FP) to be at the top the callee-save area, +// whereas for other platforms they are at the bottom. + // FIXME: LR is only callee-saved in the sense that *we* preserve it and are // presumably a callee to someone. 
External functions may not do so, but this // is currently safe since BL has LR as an implicit-def and what happens after a @@ -306,7 +365,13 @@ def CC_AArch64_GHC : CallingConv<[ // It would be better to model its preservation semantics properly (create a // vreg on entry, use it in RET & tail call generation; make that vreg def if we // end up saving LR as part of a call frame). Watch this space... -def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, +def CSR_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, LR, FP, + D8, D9, D10, D11, + D12, D13, D14, D15)>; + +// Darwin puts the frame-record at the top of the callee-save area. +def CSR_Darwin_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, X23, X24, X25, X26, X27, X28, D8, D9, D10, D11, D12, D13, D14, D15)>; @@ -314,17 +379,24 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, // Win64 has unwinding codes for an (FP,LR) pair, save_fplr and save_fplr_x. // We put FP before LR, so that frame lowering logic generates (FP,LR) pairs, // and not (LR,FP) pairs. -def CSR_Win_AArch64_AAPCS : CalleeSavedRegs<(add FP, LR, X19, X20, X21, X22, - X23, X24, X25, X26, X27, X28, +def CSR_Win_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, FP, LR, D8, D9, D10, D11, D12, D13, D14, D15)>; // AArch64 PCS for vector functions (VPCS) // must (additionally) preserve full Q8-Q23 registers -def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, - X23, X24, X25, X26, X27, X28, +def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, LR, FP, (sequence "Q%u", 8, 23))>; +// Functions taking SVE arguments or returning an SVE type +// must (additionally) preserve full Z8-Z23 and predicate registers P4-P15 +def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, LR, FP, + (sequence "Z%u", 8, 23), + (sequence "P%u", 4, 15))>; + // Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since // 'this' and the pointer return value are both passed in X0 in these cases, // this can be partially modelled by treating X0 as a callee-saved register; @@ -336,7 +408,7 @@ def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>; def CSR_AArch64_AAPCS_SwiftError - : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X21)>; + : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X21)>; // The function used by Darwin to obtain the address of a thread-local variable // guarantees more than a normal AAPCS function. x16 and x17 are used on the @@ -352,7 +424,7 @@ def CSR_AArch64_TLS_Darwin // fast path calls a function that follows CSR_AArch64_TLS_Darwin, // CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin. 
def CSR_AArch64_CXX_TLS_Darwin - : CalleeSavedRegs<(add CSR_AArch64_AAPCS, + : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, (sub (sequence "X%u", 1, 28), X15, X16, X17, X18), (sequence "D%u", 0, 31))>; diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp index 9f324b433209..35e6fef24363 100644 --- a/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -103,6 +103,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -181,6 +182,7 @@ static bool canDefBePartOfLOH(const MachineInstr &MI) { case AArch64::ADDXri: return canAddBePartOfLOH(MI); case AArch64::LDRXui: + case AArch64::LDRWui: // Check immediate to see if the immediate is an address. switch (MI.getOperand(2).getType()) { default: @@ -312,7 +314,8 @@ static void handleUse(const MachineInstr &MI, const MachineOperand &MO, Info.Type = MCLOH_AdrpAdd; Info.IsCandidate = true; Info.MI0 = &MI; - } else if (MI.getOpcode() == AArch64::LDRXui && + } else if ((MI.getOpcode() == AArch64::LDRXui || + MI.getOpcode() == AArch64::LDRWui) && MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) { Info.Type = MCLOH_AdrpLdrGot; Info.IsCandidate = true; @@ -357,7 +360,9 @@ static bool handleMiddleInst(const MachineInstr &MI, LOHInfo &DefInfo, return true; } } else { - assert(MI.getOpcode() == AArch64::LDRXui && "Expect LDRXui"); + assert((MI.getOpcode() == AArch64::LDRXui || + MI.getOpcode() == AArch64::LDRWui) && + "Expect LDRXui or LDRWui"); assert((MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) && "Expected GOT relocation"); if (OpInfo.Type == MCLOH_AdrpAddStr && OpInfo.MI1 == nullptr) { @@ -474,13 +479,23 @@ static void handleNormalInst(const MachineInstr &MI, LOHInfo *LOHInfos) { handleClobber(LOHInfos[Idx]); } // Handle uses. + + SmallSet<int, 4> UsesSeen; for (const MachineOperand &MO : MI.uses()) { if (!MO.isReg() || !MO.readsReg()) continue; int Idx = mapRegToGPRIndex(MO.getReg()); if (Idx < 0) continue; - handleUse(MI, MO, LOHInfos[Idx]); + + // Multiple uses of the same register within a single instruction don't + // count as MultiUser or block optimization. This is especially important on + // arm64_32, where any memory operation is likely to be an explicit use of + // xN and an implicit use of wN (the base address register). + if (!UsesSeen.count(Idx)) { + handleUse(MI, MO, LOHInfos[Idx]); + UsesSeen.insert(Idx); + } } } @@ -512,6 +527,7 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { switch (Opcode) { case AArch64::ADDXri: case AArch64::LDRXui: + case AArch64::LDRWui: if (canDefBePartOfLOH(MI)) { const MachineOperand &Def = MI.getOperand(0); const MachineOperand &Op = MI.getOperand(1); diff --git a/lib/Target/AArch64/AArch64Combine.td b/lib/Target/AArch64/AArch64Combine.td new file mode 100644 index 000000000000..bb99f2516ecf --- /dev/null +++ b/lib/Target/AArch64/AArch64Combine.td @@ -0,0 +1,18 @@ +//=- AArch64.td - Define AArch64 Combine Rules ---------------*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +include "llvm/Target/GlobalISel/Combine.td" + +def AArch64PreLegalizerCombinerHelper: GICombinerHelper< + "AArch64GenPreLegalizerCombinerHelper", [all_combines, + elide_br_by_inverting_cond]> { + let DisableRuleOption = "aarch64prelegalizercombiner-disable-rule"; +} diff --git a/lib/Target/AArch64/AArch64CondBrTuning.cpp b/lib/Target/AArch64/AArch64CondBrTuning.cpp index 453132e09669..25e23e4623de 100644 --- a/lib/Target/AArch64/AArch64CondBrTuning.cpp +++ b/lib/Target/AArch64/AArch64CondBrTuning.cpp @@ -78,7 +78,7 @@ void AArch64CondBrTuning::getAnalysisUsage(AnalysisUsage &AU) const { } MachineInstr *AArch64CondBrTuning::getOperandDef(const MachineOperand &MO) { - if (!TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (!Register::isVirtualRegister(MO.getReg())) return nullptr; return MRI->getUniqueVRegDef(MO.getReg()); } @@ -98,7 +98,7 @@ MachineInstr *AArch64CondBrTuning::convertToFlagSetting(MachineInstr &MI, } bool Is64Bit; unsigned NewOpc = TII->convertToFlagSettingOpc(MI.getOpcode(), Is64Bit); - unsigned NewDestReg = MI.getOperand(0).getReg(); + Register NewDestReg = MI.getOperand(0).getReg(); if (MRI->hasOneNonDBGUse(MI.getOperand(0).getReg())) NewDestReg = Is64Bit ? AArch64::XZR : AArch64::WZR; diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 2cfbcc592d6a..43ae9f8ec47f 100644 --- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -220,7 +220,7 @@ bool SSACCmpConv::trivialTailPHIs() { // PHI operands come in (VReg, MBB) pairs. for (unsigned oi = 1, oe = I.getNumOperands(); oi != oe; oi += 2) { MachineBasicBlock *MBB = I.getOperand(oi + 1).getMBB(); - unsigned Reg = I.getOperand(oi).getReg(); + Register Reg = I.getOperand(oi).getReg(); if (MBB == Head) { assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands"); HeadReg = Reg; @@ -259,7 +259,7 @@ bool SSACCmpConv::isDeadDef(unsigned DstReg) { // Writes to the zero register are dead. if (DstReg == AArch64::WZR || DstReg == AArch64::XZR) return true; - if (!TargetRegisterInfo::isVirtualRegister(DstReg)) + if (!Register::isVirtualRegister(DstReg)) return false; // A virtual register def without any uses will be marked dead later, and // eventually replaced by the zero register. @@ -631,7 +631,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { } const MCInstrDesc &MCID = TII->get(Opc); // Create a dummy virtual register for the SUBS def. - unsigned DestReg = + Register DestReg = MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI, *MF)); // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz. BuildMI(*Head, Head->end(), TermDL, MCID) diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index a43077cb88ec..bc3808df1dbc 100644 --- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -145,8 +145,8 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock( continue; // We should not have any relevant physreg defs that are replacable by // zero before register allocation. So we just check for dead vreg defs. 
- unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg) || + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg) || (!MO.isDead() && !MRI->use_nodbg_empty(Reg))) continue; assert(!MO.isImplicit() && "Unexpected implicit def!"); diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 210c10eb1842..082e17e44d04 100644 --- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -109,7 +109,7 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned BitSize) { MachineInstr &MI = *MBBI; - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); uint64_t Imm = MI.getOperand(1).getImm(); if (DstReg == AArch64::XZR || DstReg == AArch64::WZR) { @@ -150,7 +150,7 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, } break; case AArch64::MOVKWi: case AArch64::MOVKXi: { - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode)) .addReg(DstReg, @@ -174,14 +174,14 @@ bool AArch64ExpandPseudo::expandCMP_SWAP( MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); const MachineOperand &Dest = MI.getOperand(0); - unsigned StatusReg = MI.getOperand(1).getReg(); + Register StatusReg = MI.getOperand(1).getReg(); bool StatusDead = MI.getOperand(1).isDead(); // Duplicating undef operands into 2 instructions does not guarantee the same // value on both; However undef should be replaced by xzr anyway. assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); - unsigned AddrReg = MI.getOperand(2).getReg(); - unsigned DesiredReg = MI.getOperand(3).getReg(); - unsigned NewReg = MI.getOperand(4).getReg(); + Register AddrReg = MI.getOperand(2).getReg(); + Register DesiredReg = MI.getOperand(3).getReg(); + Register NewReg = MI.getOperand(4).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -254,16 +254,16 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( DebugLoc DL = MI.getDebugLoc(); MachineOperand &DestLo = MI.getOperand(0); MachineOperand &DestHi = MI.getOperand(1); - unsigned StatusReg = MI.getOperand(2).getReg(); + Register StatusReg = MI.getOperand(2).getReg(); bool StatusDead = MI.getOperand(2).isDead(); // Duplicating undef operands into 2 instructions does not guarantee the same // value on both; However undef should be replaced by xzr anyway. 
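For readers following the CMP_SWAP expansion being touched here, the control flow it produces has roughly this shape (labels, registers, and the exact load/store widths are illustrative, not emitted verbatim):

    // .LloadCmp: ldaxr xDest, [xAddr]        // load-acquire exclusive
    //            cmp   xDest, xDesired
    //            b.ne  .Ldone                // mismatch: give up
    //            stlxr wStatus, xNew, [xAddr]
    //            cbnz  wStatus, .LloadCmp    // lost reservation: retry
    // .Ldone: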
assert(!MI.getOperand(3).isUndef() && "cannot handle undef"); - unsigned AddrReg = MI.getOperand(3).getReg(); - unsigned DesiredLoReg = MI.getOperand(4).getReg(); - unsigned DesiredHiReg = MI.getOperand(5).getReg(); - unsigned NewLoReg = MI.getOperand(6).getReg(); - unsigned NewHiReg = MI.getOperand(7).getReg(); + Register AddrReg = MI.getOperand(3).getReg(); + Register DesiredLoReg = MI.getOperand(4).getReg(); + Register DesiredHiReg = MI.getOperand(5).getReg(); + Register NewLoReg = MI.getOperand(6).getReg(); + Register NewHiReg = MI.getOperand(7).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -475,7 +475,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, case AArch64::LOADgot: { MachineFunction *MF = MBB.getParent(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); const MachineOperand &MO1 = MI.getOperand(1); unsigned Flags = MO1.getTargetFlags(); @@ -495,12 +495,26 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, } } else { // Small codemodel expand into ADRP + LDR. + MachineFunction &MF = *MI.getParent()->getParent(); + DebugLoc DL = MI.getDebugLoc(); MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg); - MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui)) - .add(MI.getOperand(0)) - .addReg(DstReg); + + MachineInstrBuilder MIB2; + if (MF.getSubtarget<AArch64Subtarget>().isTargetILP32()) { + auto TRI = MBB.getParent()->getSubtarget().getRegisterInfo(); + unsigned Reg32 = TRI->getSubReg(DstReg, AArch64::sub_32); + unsigned DstFlags = MI.getOperand(0).getTargetFlags(); + MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRWui)) + .addDef(Reg32) + .addReg(DstReg, RegState::Kill) + .addReg(DstReg, DstFlags | RegState::Implicit); + } else { + unsigned DstReg = MI.getOperand(0).getReg(); + MIB2 = BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDRXui)) + .add(MI.getOperand(0)) + .addUse(DstReg, RegState::Kill); + } if (MO1.isGlobal()) { MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | AArch64II::MO_PAGE); @@ -534,11 +548,28 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, case AArch64::MOVaddrTLS: case AArch64::MOVaddrEXT: { // Expand into ADRP + ADD. - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg) .add(MI.getOperand(1)); + if (MI.getOperand(1).getTargetFlags() & AArch64II::MO_TAGGED) { + // MO_TAGGED on the page indicates a tagged address. Set the tag now. + // We do so by creating a MOVK that sets bits 48-63 of the register to + // (global address + 0x100000000 - PC) >> 48. This assumes that we're in + // the small code model so we can assume a binary size of <= 4GB, which + // makes the untagged PC relative offset positive. The binary must also be + // loaded into address range [0, 2^48). Both of these properties need to + // be ensured at runtime when using tagged addresses. 
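The comment above fully determines the sequence the code below emits; spelled out with approximate assembler and relocation syntax (`g` is a hypothetical tagged global):

    // adrp x0, :pg_hi21:g                        // tag bits 48-63 still zero
    // movk x0, #:prel_g3:g+4294967296, lsl #48   // write (g + 2^32 - PC) >> 48
    // add  x0, x0, :lo12:g
    // The linker-resolved G3 chunk carries the tag into bits 48-63.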
+ auto Tag = MI.getOperand(1); + Tag.setTargetFlags(AArch64II::MO_PREL | AArch64II::MO_G3); + Tag.setOffset(0x100000000); + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi), DstReg) + .addReg(DstReg) + .add(Tag) + .addImm(48); + } + MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri)) .add(MI.getOperand(0)) @@ -561,7 +592,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, return true; case AArch64::MOVbaseTLS: { - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); auto SysReg = AArch64SysReg::TPIDR_EL0; MachineFunction *MF = MBB.getParent(); if (MF->getTarget().getTargetTriple().isOSFuchsia() && @@ -642,11 +673,12 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, // instruction sequence. int BaseOffset = -AFI->getTaggedBasePointerOffset(); unsigned FrameReg; - int FrameRegOffset = TFI->resolveFrameOffsetReference( - MF, BaseOffset, false /*isFixed*/, FrameReg, /*PreferFP=*/false, + StackOffset FrameRegOffset = TFI->resolveFrameOffsetReference( + MF, BaseOffset, false /*isFixed*/, false /*isSVE*/, FrameReg, + /*PreferFP=*/false, /*ForSimm=*/true); Register SrcReg = FrameReg; - if (FrameRegOffset != 0) { + if (FrameRegOffset) { // Use output register as temporary. SrcReg = MI.getOperand(0).getReg(); emitFrameOffset(MBB, &MI, MI.getDebugLoc(), SrcReg, FrameReg, diff --git a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp index 3b3182128c4c..b54fc2e51bac 100644 --- a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp +++ b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -642,7 +642,7 @@ static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) { } // Loads from the stack pointer don't get prefetched. 
- unsigned BaseReg = MI.getOperand(BaseRegIdx).getReg(); + Register BaseReg = MI.getOperand(BaseRegIdx).getReg(); if (BaseReg == AArch64::SP || BaseReg == AArch64::WSP) return None; diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 8dc2768b9597..277a3052f1e5 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -459,7 +459,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { if (!Subtarget->useSmallAddressing() && !Subtarget->isTargetMachO()) return 0; - unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); + unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); EVT DestEVT = TLI.getValueType(DL, GV->getType(), true); if (!DestEVT.isSimple()) @@ -474,12 +474,32 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { ADRPReg) .addGlobalAddress(GV, 0, AArch64II::MO_PAGE | OpFlags); - ResultReg = createResultReg(&AArch64::GPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui), + unsigned LdrOpc; + if (Subtarget->isTargetILP32()) { + ResultReg = createResultReg(&AArch64::GPR32RegClass); + LdrOpc = AArch64::LDRWui; + } else { + ResultReg = createResultReg(&AArch64::GPR64RegClass); + LdrOpc = AArch64::LDRXui; + } + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(LdrOpc), ResultReg) - .addReg(ADRPReg) - .addGlobalAddress(GV, 0, - AArch64II::MO_PAGEOFF | AArch64II::MO_NC | OpFlags); + .addReg(ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC | OpFlags); + if (!Subtarget->isTargetILP32()) + return ResultReg; + + // LDRWui produces a 32-bit register, but pointers in-register are 64-bits + // so we must extend the result on ILP32. + unsigned Result64 = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG)) + .addDef(Result64) + .addImm(0) + .addReg(ResultReg, RegState::Kill) + .addImm(AArch64::sub_32); + return Result64; } else { // ADRP + ADDX BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), @@ -504,6 +524,15 @@ unsigned AArch64FastISel::fastMaterializeConstant(const Constant *C) { if (!CEVT.isSimple()) return 0; MVT VT = CEVT.getSimpleVT(); + // arm64_32 has 32-bit pointers held in 64-bit registers. Because of that, + // 'null' pointers need to have a somewhat special treatment. + if (const auto *CPN = dyn_cast<ConstantPointerNull>(C)) { + (void)CPN; + assert(CPN->getType()->getPointerAddressSpace() == 0 && + "Unexpected address space"); + assert(VT == MVT::i64 && "Expected 64-bit pointers"); + return materializeInt(ConstantInt::get(Type::getInt64Ty(*Context), 0), VT); + } if (const auto *CI = dyn_cast<ConstantInt>(C)) return materializeInt(CI, VT); @@ -946,6 +975,9 @@ bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) { bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) { EVT evt = TLI.getValueType(DL, Ty, true); + if (Subtarget->isTargetILP32() && Ty->isPointerTy()) + return false; + // Only handle simple types. 
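A sketch of the arm64_32 sequence the materializeGV change above produces (Darwin-style assembler shown in comments; `_g` is an illustrative global):

    // adrp x8, _g@GOTPAGE
    // ldr  w8, [x8, _g@GOTPAGEOFF]   // GOT slot holds a 32-bit pointer
    // Writing w8 implicitly zeroes the top half of x8; SUBREG_TO_REG models
    // that zero-extension so the 64-bit register can feed addressing modes.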
if (evt == MVT::Other || !evt.isSimple()) return false; @@ -988,6 +1020,9 @@ bool AArch64FastISel::isValueAvailable(const Value *V) const { } bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { + if (Subtarget->isTargetILP32()) + return false; + unsigned ScaleFactor = getImplicitScaleFactor(VT); if (!ScaleFactor) return false; @@ -3165,6 +3200,11 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { if (IsTailCall) return false; + // FIXME: we could and should support this, but for now correctness at -O0 is + // more important. + if (Subtarget->isTargetILP32()) + return false; + CodeModel::Model CM = TM.getCodeModel(); // Only support the small-addressing and large code models. if (CM != CodeModel::Large && !Subtarget->useSmallAddressing()) @@ -3434,8 +3474,8 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { MFI.setFrameAddressIsTaken(true); const AArch64RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); - unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); - unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + Register FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); + Register SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), SrcReg).addReg(FramePtr); // Recursively load frame address @@ -3796,6 +3836,11 @@ bool AArch64FastISel::selectRet(const Instruction *I) { if (!FuncInfo.CanLowerReturn) return false; + // FIXME: in principle it could. Mostly just a case of zero extending outgoing + // pointers. + if (Subtarget->isTargetILP32()) + return false; + if (F.isVarArg()) return false; @@ -3842,7 +3887,7 @@ bool AArch64FastISel::selectRet(const Instruction *I) { return false; unsigned SrcReg = Reg + VA.getValNo(); - unsigned DestReg = VA.getLocReg(); + Register DestReg = VA.getLocReg(); // Avoid a cross-class copy. This is very unlikely. if (!MRI.getRegClass(SrcReg)->contains(DestReg)) return false; @@ -3970,7 +4015,7 @@ unsigned AArch64FastISel::emiti1Ext(unsigned SrcReg, MVT DestVT, bool IsZExt) { if (DestVT == MVT::i64) { // We're ZExt i1 to i64. The ANDWri Wd, Ws, #1 implicitly clears the // upper 32 bits. Emit a SUBREG_TO_REG to extend from Wd to Xd. 
- unsigned Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + Register Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), Reg64) .addImm(0) @@ -4123,7 +4168,7 @@ unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0, }; unsigned Opc = OpcTable[IsZExt][Is64Bit]; if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) { - unsigned TmpReg = MRI.createVirtualRegister(RC); + Register TmpReg = MRI.createVirtualRegister(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), TmpReg) .addImm(0) @@ -4244,7 +4289,7 @@ unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, }; unsigned Opc = OpcTable[IsZExt][Is64Bit]; if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) { - unsigned TmpReg = MRI.createVirtualRegister(RC); + Register TmpReg = MRI.createVirtualRegister(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), TmpReg) .addImm(0) @@ -4353,7 +4398,7 @@ unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, }; unsigned Opc = OpcTable[IsZExt][Is64Bit]; if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) { - unsigned TmpReg = MRI.createVirtualRegister(RC); + Register TmpReg = MRI.createVirtualRegister(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), TmpReg) .addImm(0) @@ -4412,7 +4457,7 @@ unsigned AArch64FastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, if (DestVT == MVT::i8 || DestVT == MVT::i16) DestVT = MVT::i32; else if (DestVT == MVT::i64) { - unsigned Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + Register Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), Src64) .addImm(0) @@ -4495,7 +4540,7 @@ bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT, const auto *LoadMI = MI; if (LoadMI->getOpcode() == TargetOpcode::COPY && LoadMI->getOperand(1).getSubReg() == AArch64::sub_32) { - unsigned LoadReg = MI->getOperand(1).getReg(); + Register LoadReg = MI->getOperand(1).getReg(); LoadMI = MRI.getUniqueVRegDef(LoadReg); assert(LoadMI && "Expected valid instruction"); } diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 8c6e5cbd5c13..68e1e6a30224 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -44,11 +44,19 @@ // | | // |-----------------------------------| // | | -// | prev_fp, prev_lr | +// | callee-saved gpr registers | <--. +// | | | On Darwin platforms these +// |- - - - - - - - - - - - - - - - - -| | callee saves are swapped, +// | | | (frame record first) +// | prev_fp, prev_lr | <--' // | (a.k.a. "frame record") | // |-----------------------------------| <- fp(=x29) // | | -// | other callee-saved registers | +// | callee-saved fp/simd/SVE regs | +// | | +// |-----------------------------------| +// | | +// | SVE stack objects | // | | // |-----------------------------------| // |.empty.space.to.make.part.below....| @@ -80,6 +88,20 @@ // * A frame pointer is definitely needed when there are local variables with // more-than-default alignment requirements. // +// For Darwin platforms the frame-record (fp, lr) is stored at the top of the +// callee-saved area, since the unwind encoding does not allow for encoding +// this dynamically and existing tools depend on this layout. 
For other +// platforms, the frame-record is stored at the bottom of the (gpr) callee-saved +// area to allow SVE stack objects (allocated directly below the callee-saves, +// if available) to be accessed directly from the framepointer. +// The SVE spill/fill instructions have VL-scaled addressing modes such +// as: +// ldr z8, [fp, #-7 mul vl] +// For SVE the size of the vector length (VL) is not known at compile-time, so +// '#-7 mul vl' is an offset that can only be evaluated at runtime. With this +// layout, we don't need to add an unscaled offset to the framepointer before +// accessing the SVE object in the frame. +// // In some cases when a base pointer is not strictly needed, it is generated // anyway when offsets from the frame pointer to access local variables become // so large that the offset can't be encoded in the immediate fields of loads @@ -94,6 +116,7 @@ #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64RegisterInfo.h" +#include "AArch64StackOffset.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" @@ -173,7 +196,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) { if (!MO.isFI()) continue; - int Offset = 0; + StackOffset Offset; if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) == AArch64FrameOffsetCannotUpdate) return 0; @@ -183,6 +206,12 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) { return DefaultSafeSPDisplacement; } +/// Returns the size of the entire SVE stackframe (calleesaves + spills). +static StackOffset getSVEStackSize(const MachineFunction &MF) { + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + return {(int64_t)AFI->getStackSizeSVE(), MVT::nxv1i8}; +} + bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { if (!EnableRedZone) return false; @@ -195,7 +224,8 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); unsigned NumBytes = AFI->getLocalStackSize(); - return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128); + return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 || + getSVEStackSize(MF)); } /// hasFP - Return true if the specified function should have a dedicated frame @@ -273,14 +303,15 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( // Most call frames will be allocated at the start of a function so // this is OK, but it is a limitation that needs dealing with. assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); - emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII); + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, {Amount, MVT::i8}, + TII); } } else if (CalleePopAmount != 0) { // If the calling convention demands that the callee pops arguments from the // stack, we want to add it back if we have a reserved call frame. 
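Throughout the frame-lowering changes below, plain byte offsets become `{Bytes, MVT::i8}` pairs and SVE quantities become `{N, MVT::nxv1i8}`. A minimal self-contained model of the two-part StackOffset arithmetic — a sketch of the operations this file relies on, not the LLVM class itself:

    // Fixed byte part plus a part scaled by the runtime vector-length
    // multiple (vscale); mirrors the +, unary -, bool, and accessor uses.
    struct StackOffsetModel {
      long long Bytes = 0;          // MVT::i8 contributions
      long long ScalableBytes = 0;  // MVT::nxv1i8 contributions (x vscale)

      StackOffsetModel operator+(const StackOffsetModel &O) const {
        return {Bytes + O.Bytes, ScalableBytes + O.ScalableBytes};
      }
      StackOffsetModel operator-() const { return {-Bytes, -ScalableBytes}; }
      explicit operator bool() const { return Bytes || ScalableBytes; }
      long long getBytes() const { return Bytes; }
      long long getScalableBytes() const { return ScalableBytes; }
    };
    // e.g. the SVE stack size plus a 16-byte local area reads as
    // {16, StackSizeSVE}, i.e. "16 + StackSizeSVE * vscale" bytes.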
assert(CalleePopAmount < 0xffffff && "call frame too large"); - emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount, - TII); + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, + {-(int64_t)CalleePopAmount, MVT::i8}, TII); } return MBB.erase(I); } @@ -416,6 +447,9 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + if (MF.getFunction().hasOptSize()) + return false; + if (AFI->getLocalStackSize() == 0) return false; @@ -436,6 +470,11 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( if (canUseRedZone(MF)) return false; + // When there is an SVE area on the stack, always allocate the + // callee-saves and spills/locals separately. + if (getSVEStackSize(MF)) + return false; + return true; } @@ -474,8 +513,8 @@ static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI, Imm = -Imm; LLVM_FALLTHROUGH; case AArch64::STPXpre: { - unsigned Reg0 = MBBI->getOperand(1).getReg(); - unsigned Reg1 = MBBI->getOperand(2).getReg(); + Register Reg0 = MBBI->getOperand(1).getReg(); + Register Reg1 = MBBI->getOperand(2).getReg(); if (Reg0 == AArch64::FP && Reg1 == AArch64::LR) MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X)) .addImm(Imm * 8) @@ -523,8 +562,8 @@ static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI, } case AArch64::STPXi: case AArch64::LDPXi: { - unsigned Reg0 = MBBI->getOperand(0).getReg(); - unsigned Reg1 = MBBI->getOperand(1).getReg(); + Register Reg0 = MBBI->getOperand(0).getReg(); + Register Reg1 = MBBI->getOperand(1).getReg(); if (Reg0 == AArch64::FP && Reg1 == AArch64::LR) MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR)) .addImm(Imm * 8) @@ -791,6 +830,10 @@ static bool needsWinCFI(const MachineFunction &MF) { F.needsUnwindTableEntry(); } +static bool isTargetDarwin(const MachineFunction &MF) { + return MF.getSubtarget<AArch64Subtarget>().isTargetDarwin(); +} + void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.begin(); @@ -846,6 +889,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Ideally it should match SP value after prologue. AFI->setTaggedBasePointerOffset(MFI.getStackSize()); + const StackOffset &SVEStackSize = getSVEStackSize(MF); + // getStackSize() includes all the locals in its size calculation. We don't // include these locals when computing the stack size of a funclet, as they // are allocated in the parent's stack frame and accessed via the frame @@ -856,6 +901,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, : (int)MFI.getStackSize(); if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) { assert(!HasFP && "unexpected function without stack frame but with FP"); + assert(!SVEStackSize && + "unexpected function without stack frame but with SVE objects"); // All of the stack allocation is for locals. 
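The decisions above (opt-size, red zone, and an SVE area forcing separate allocation) shape the prologue's SP adjustments; a hedged sketch with made-up sizes (48-byte callee-saves, 32-byte locals; the exact VL-scaled opcode depends on the offset):

    // combined bump (no SVE):  sub   sp, sp, #80    // CS + locals at once
    // split (SVE present):     sub   sp, sp, #48    // callee-saves
    //                          addvl sp, sp, #-N    // SVE area, VL-scaled
    //                          sub   sp, sp, #32    // locals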
AFI->setLocalStackSize(NumBytes); if (!NumBytes) @@ -866,8 +913,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, AFI->setHasRedZone(true); ++NumRedZoneFunctions; } else { - emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, + {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, + false, NeedsWinCFI, &HasWinCFI); if (!NeedsWinCFI) { // Label used to tie together the PROLOG_LABEL and the MachineMoves. MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); @@ -901,8 +949,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, AFI->setLocalStackSize(NumBytes - PrologueSaveSize); bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); if (CombineSPBump) { - emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + assert(!SVEStackSize && "Cannot combine SP bump with SVE"); + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, + {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, false, + NeedsWinCFI, &HasWinCFI); NumBytes = 0; } else if (PrologueSaveSize != 0) { MBBI = convertCalleeSaveRestoreToSPPrePostIncDec( @@ -948,9 +998,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, } if (HasFP) { - // Only set up FP if we actually need to. Frame pointer is fp = - // sp - fixedobject - 16. - int FPOffset = AFI->getCalleeSavedStackSize() - 16; + // Only set up FP if we actually need to. + int FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0; + if (CombineSPBump) FPOffset += AFI->getLocalStackSize(); @@ -958,8 +1008,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // mov fp,sp when FPOffset is zero. // Note: All stores of callee-saved registers are marked as "FrameSetup". // This code marks the instruction(s) that set the FP also. - emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, + {FPOffset, MVT::i8}, TII, MachineInstr::FrameSetup, false, + NeedsWinCFI, &HasWinCFI); } if (windowsRequiresStackProbe(MF, NumBytes)) { @@ -1056,6 +1107,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, NumBytes = 0; } + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -SVEStackSize, TII, + MachineInstr::FrameSetup); + // Allocate space for the rest of the frame. if (NumBytes) { const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); @@ -1071,8 +1125,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have // the correct value here, as NumBytes also includes padding bytes, // which shouldn't be counted here. 
- emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, + {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, + false, NeedsWinCFI, &HasWinCFI); if (NeedsRealignment) { const unsigned Alignment = MFI.getMaxAlignment(); @@ -1130,8 +1185,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (needsFrameMoves) { const DataLayout &TD = MF.getDataLayout(); - const int StackGrowth = -TD.getPointerSize(0); - unsigned FramePtr = RegInfo->getFrameRegister(MF); + const int StackGrowth = isTargetDarwin(MF) + ? (2 * -TD.getPointerSize(0)) + : -AFI->getCalleeSavedStackSize(); + Register FramePtr = RegInfo->getFrameRegister(MF); // An example of the prologue: // // .globl __foo @@ -1202,7 +1259,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Define the current CFA rule to use the provided FP. unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa( - nullptr, Reg, 2 * StackGrowth - FixedObject)); + nullptr, Reg, StackGrowth - FixedObject)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -1401,11 +1458,14 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameDestroy); } + const StackOffset &SVEStackSize = getSVEStackSize(MF); + // If there is a single SP update, insert it before the ret and we're done. if (CombineSPBump) { + assert(!SVEStackSize && "Cannot combine SP bump with SVE"); emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, - NumBytes + AfterCSRPopSize, TII, MachineInstr::FrameDestroy, - false, NeedsWinCFI, &HasWinCFI); + {NumBytes + (int64_t)AfterCSRPopSize, MVT::i8}, TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); if (NeedsWinCFI && HasWinCFI) BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) @@ -1416,6 +1476,12 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); + // Deallocate the SVE area. + if (SVEStackSize) + if (!AFI->isStackRealigned()) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, SVEStackSize, + TII, MachineInstr::FrameDestroy); + if (!hasFP(MF)) { bool RedZone = canUseRedZone(MF); // If this was a redzone leaf function, we don't need to restore the @@ -1437,8 +1503,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI); emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, - StackRestoreBytes, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); + {StackRestoreBytes, MVT::i8}, TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); if (Done) { if (NeedsWinCFI) { HasWinCFI = true; @@ -1456,13 +1522,16 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // FIXME: Rather than doing the math here, we should instead just use // non-post-indexed loads for the restores if we aren't actually going to // be able to save any instructions. - if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) + if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) { + int64_t OffsetToFrameRecord = + isTargetDarwin(MF) ? 
(-(int64_t)AFI->getCalleeSavedStackSize() + 16) : 0; emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, - -AFI->getCalleeSavedStackSize() + 16, TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI); - else if (NumBytes) - emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI); + {OffsetToFrameRecord, MVT::i8}, + TII, MachineInstr::FrameDestroy, false, NeedsWinCFI); + } else if (NumBytes) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, + {NumBytes, MVT::i8}, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI); // This must be placed after the callee-save restore code because that code // assumes the SP is at the same location as it was after the callee-save save @@ -1483,8 +1552,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, adaptForLdStOpt(MBB, FirstSPPopI, LastPopI); emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP, - AfterCSRPopSize, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); + {(int64_t)AfterCSRPopSize, MVT::i8}, TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); } if (NeedsWinCFI && HasWinCFI) BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) @@ -1501,10 +1570,11 @@ int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { return resolveFrameIndexReference( - MF, FI, FrameReg, - /*PreferFP=*/ - MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress), - /*ForSimm=*/false); + MF, FI, FrameReg, + /*PreferFP=*/ + MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress), + /*ForSimm=*/false) + .getBytes(); } int AArch64FrameLowering::getNonLocalFrameIndexReference( @@ -1512,18 +1582,19 @@ int AArch64FrameLowering::getNonLocalFrameIndexReference( return getSEHFrameIndexOffset(MF, FI); } -static int getFPOffset(const MachineFunction &MF, int ObjectOffset) { +static StackOffset getFPOffset(const MachineFunction &MF, int ObjectOffset) { const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; - return ObjectOffset + FixedObject + 16; + unsigned FPAdjust = isTargetDarwin(MF) ? 16 : AFI->getCalleeSavedStackSize(); + return {ObjectOffset + FixedObject + FPAdjust, MVT::i8}; } -static int getStackOffset(const MachineFunction &MF, int ObjectOffset) { +static StackOffset getStackOffset(const MachineFunction &MF, int ObjectOffset) { const auto &MFI = MF.getFrameInfo(); - return ObjectOffset + MFI.getStackSize(); + return {ObjectOffset + (int)MFI.getStackSize(), MVT::i8}; } int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF, @@ -1532,23 +1603,23 @@ int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF, MF.getSubtarget().getRegisterInfo()); int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI); return RegInfo->getLocalAddressRegister(MF) == AArch64::FP - ? getFPOffset(MF, ObjectOffset) - : getStackOffset(MF, ObjectOffset); + ? 
getFPOffset(MF, ObjectOffset).getBytes() + : getStackOffset(MF, ObjectOffset).getBytes(); } -int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, - int FI, unsigned &FrameReg, - bool PreferFP, - bool ForSimm) const { +StackOffset AArch64FrameLowering::resolveFrameIndexReference( + const MachineFunction &MF, int FI, unsigned &FrameReg, bool PreferFP, + bool ForSimm) const { const auto &MFI = MF.getFrameInfo(); int ObjectOffset = MFI.getObjectOffset(FI); bool isFixed = MFI.isFixedObjectIndex(FI); - return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, FrameReg, + bool isSVE = MFI.getStackID(FI) == TargetStackID::SVEVector; + return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg, PreferFP, ForSimm); } -int AArch64FrameLowering::resolveFrameOffsetReference( - const MachineFunction &MF, int ObjectOffset, bool isFixed, +StackOffset AArch64FrameLowering::resolveFrameOffsetReference( + const MachineFunction &MF, int ObjectOffset, bool isFixed, bool isSVE, unsigned &FrameReg, bool PreferFP, bool ForSimm) const { const auto &MFI = MF.getFrameInfo(); const auto *RegInfo = static_cast<const AArch64RegisterInfo *>( @@ -1556,17 +1627,23 @@ int AArch64FrameLowering::resolveFrameOffsetReference( const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); - int FPOffset = getFPOffset(MF, ObjectOffset); - int Offset = getStackOffset(MF, ObjectOffset); + int FPOffset = getFPOffset(MF, ObjectOffset).getBytes(); + int Offset = getStackOffset(MF, ObjectOffset).getBytes(); bool isCSR = !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize()); + const StackOffset &SVEStackSize = getSVEStackSize(MF); + // Use frame pointer to reference fixed objects. Use it for locals if // there are VLAs or a dynamically realigned SP (and thus the SP isn't // reliable as a base). Make sure useFPForScavengingIndex() does the // right thing for the emergency spill slot. bool UseFP = false; - if (AFI->hasStackFrame()) { + if (AFI->hasStackFrame() && !isSVE) { + // We shouldn't prefer using the FP when there is an SVE area + // in between the FP and the non-SVE locals/spills. + PreferFP &= !SVEStackSize; + // Note: Keeping the following as multiple 'if' statements rather than // merging to a single expression for readability. // @@ -1594,8 +1671,10 @@ int AArch64FrameLowering::resolveFrameOffsetReference( bool CanUseBP = RegInfo->hasBasePointer(MF); if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best. UseFP = PreferFP; - else if (!CanUseBP) // Can't use BP. Forced to use FP. + else if (!CanUseBP) { // Can't use BP. Forced to use FP. + assert(!SVEStackSize && "Expected BP to be available"); UseFP = true; + } // else we can use BP and FP, but the offset from FP won't fit. // That will make us scavenge registers which we can probably avoid by // using BP. If it won't fit for BP either, we'll scavenge anyway. @@ -1625,9 +1704,36 @@ int AArch64FrameLowering::resolveFrameOffsetReference( "In the presence of dynamic stack pointer realignment, " "non-argument/CSR objects cannot be accessed through the frame pointer"); + if (isSVE) { + int64_t OffsetToSVEArea = + MFI.getStackSize() - AFI->getCalleeSavedStackSize(); + StackOffset FPOffset = {ObjectOffset, MVT::nxv1i8}; + StackOffset SPOffset = SVEStackSize + + StackOffset(ObjectOffset, MVT::nxv1i8) + + StackOffset(OffsetToSVEArea, MVT::i8); + // Always use the FP for SVE spills if available and beneficial. 
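The SVE case above computes both candidate offsets, and the selection immediately below prefers FP when beneficial. In the two-part (bytes, scalable) notation sketched earlier:

    // FPOffset = (0, ObjectOffset)                   // pure VL-scaled, e.g.
    //                                                // ldr z8, [x29, #-7, mul vl]
    // SPOffset = (StackSize - CalleeSavedStackSize,  // bytes down to the SVE
    //             SVEStackSize + ObjectOffset)       // area, plus its VL part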
+ if (hasFP(MF) && + (SPOffset.getBytes() || + FPOffset.getScalableBytes() < SPOffset.getScalableBytes() || + RegInfo->needsStackRealignment(MF))) { + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; + } + + FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister() + : (unsigned)AArch64::SP; + return SPOffset; + } + + StackOffset ScalableOffset = {}; + if (UseFP && !(isFixed || isCSR)) + ScalableOffset = -SVEStackSize; + if (!UseFP && (isFixed || isCSR)) + ScalableOffset = SVEStackSize; + if (UseFP) { FrameReg = RegInfo->getFrameRegister(MF); - return FPOffset; + return StackOffset(FPOffset, MVT::i8) + ScalableOffset; } // Use the base pointer if we have one. @@ -1644,7 +1750,7 @@ int AArch64FrameLowering::resolveFrameOffsetReference( Offset -= AFI->getLocalStackSize(); } - return Offset; + return StackOffset(Offset, MVT::i8) + ScalableOffset; } static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { @@ -1682,6 +1788,23 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, return true; } +/// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction. +/// WindowsCFI requires that only consecutive registers can be paired. +/// LR and FP need to be allocated together when the frame needs to save +/// the frame-record. This means any other register pairing with LR is invalid. +static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2, + bool NeedsWinCFI, bool NeedsFrameRecord) { + if (NeedsWinCFI) + return invalidateWindowsRegisterPairing(Reg1, Reg2, true); + + // If we need to store the frame record, don't pair any register + // with LR other than FP. + if (NeedsFrameRecord) + return Reg2 == AArch64::LR; + + return false; +} + namespace { struct RegPairInfo { @@ -1701,7 +1824,7 @@ struct RegPairInfo { static void computeCalleeSaveRegisterPairs( MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs, - bool &NeedShadowCallStackProlog) { + bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) { if (CSI.empty()) return; @@ -1743,7 +1866,8 @@ static void computeCalleeSaveRegisterPairs( switch (RPI.Type) { case RegPairInfo::GPR: if (AArch64::GPR64RegClass.contains(NextReg) && - !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI)) + !invalidateRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI, + NeedsFrameRecord)) RPI.Reg2 = NextReg; break; case RegPairInfo::FPR64: @@ -1777,6 +1901,10 @@ static void computeCalleeSaveRegisterPairs( (CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) && "Out of order callee saved regs!"); + assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP || + RPI.Reg1 == AArch64::LR) && + "FrameRecord must be allocated together with LR"); + // MachO's compact unwind format relies on all registers being stored in // adjacent register pairs. 
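A concrete reading of invalidateRegisterPairing above (save order and stp forms illustrative): when a frame record is required, LR may pair only with FP, so the GPR saves come out as

    // stp x27, x28, [sp, #...]
    // stp x29, x30, [sp, #...]   // (FP, LR) kept adjacent as the record
    // A candidate pair such as (x28, lr) is rejected, keeping the frame
    // record contiguous for the layout assumptions in this file.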
assert((!produceCompactUnwindFrame(MF) || @@ -1825,7 +1953,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( bool NeedShadowCallStackProlog = false; computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, - NeedShadowCallStackProlog); + NeedShadowCallStackProlog, hasFP(MF)); const MachineRegisterInfo &MRI = MF.getRegInfo(); if (NeedShadowCallStackProlog) { @@ -1955,7 +2083,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( bool NeedShadowCallStackProlog = false; computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, - NeedShadowCallStackProlog); + NeedShadowCallStackProlog, hasFP(MF)); auto EmitMI = [&](const RegPairInfo &RPI) { unsigned Reg1 = RPI.Reg1; @@ -2113,19 +2241,26 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.set(AArch64::LR); } - LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"; + LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:"; for (unsigned Reg : SavedRegs.set_bits()) dbgs() << ' ' << printReg(Reg, RegInfo); dbgs() << "\n";); // If any callee-saved registers are used, the frame cannot be eliminated. - bool CanEliminateFrame = SavedRegs.count() == 0; + unsigned MaxAlign = getStackAlignment(); + int64_t SVEStackSize = + alignTo(determineSVEStackSize(MFI, MaxAlign), MaxAlign); + assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes"); + bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize; // The CSR spill slots have not been allocated yet, so estimateStackSize // won't include them. unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF); - bool BigStack = (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit; + + // Conservatively always assume BigStack when there are SVE spills. + bool BigStack = SVEStackSize || + (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit; if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) AFI->setHasStackFrame(true); @@ -2145,7 +2280,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // store the pair. if (produceCompactUnwindFrame(MF)) SavedRegs.set(UnspilledCSGPRPaired); - ExtraCSSpill = UnspilledCSGPRPaired; + ExtraCSSpill = UnspilledCSGPR; } // If we didn't find an extra callee-saved register to spill, create @@ -2181,14 +2316,42 @@ bool AArch64FrameLowering::enableStackSlotScavenging( return AFI->hasCalleeSaveStackFreeSpace(); } +int64_t AArch64FrameLowering::determineSVEStackSize(MachineFrameInfo &MFI, + unsigned &MaxAlign) const { + // Process all fixed stack objects. + int64_t Offset = 0; + for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) + if (MFI.getStackID(I) == TargetStackID::SVEVector) { + int64_t FixedOffset = -MFI.getObjectOffset(I); + if (FixedOffset > Offset) + Offset = FixedOffset; + } + + // Note: We don't take allocatable stack objects into + // account yet, because allocation for those is not yet + // implemented. 
+ return Offset; +} + void AArch64FrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { + MachineFrameInfo &MFI = MF.getFrameInfo(); + + assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown && + "Upwards growing stack unsupported"); + + unsigned MaxAlign = getStackAlignment(); + int64_t SVEStackSize = determineSVEStackSize(MFI, MaxAlign); + + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + AFI->setStackSizeSVE(alignTo(SVEStackSize, MaxAlign)); + assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes"); + // If this function isn't doing Win64-style C++ EH, we don't need to do // anything. if (!MF.hasEHFunclets()) return; const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - MachineFrameInfo &MFI = MF.getFrameInfo(); WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo(); MachineBasicBlock &MBB = MF.front(); diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index 6dbd34b2189f..ac150e86c9eb 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H #define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H +#include "AArch64StackOffset.h" #include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { @@ -20,7 +21,7 @@ namespace llvm { class AArch64FrameLowering : public TargetFrameLowering { public: explicit AArch64FrameLowering() - : TargetFrameLowering(StackGrowsDown, 16, 0, 16, + : TargetFrameLowering(StackGrowsDown, Align(16), 0, Align(16), true /*StackRealignable*/) {} void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, @@ -39,12 +40,13 @@ public: int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; - int resolveFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg, bool PreferFP, - bool ForSimm) const; - int resolveFrameOffsetReference(const MachineFunction &MF, int ObjectOffset, - bool isFixed, unsigned &FrameReg, - bool PreferFP, bool ForSimm) const; + StackOffset resolveFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg, bool PreferFP, + bool ForSimm) const; + StackOffset resolveFrameOffsetReference(const MachineFunction &MF, + int ObjectOffset, bool isFixed, + bool isSVE, unsigned &FrameReg, + bool PreferFP, bool ForSimm) const; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, @@ -85,9 +87,21 @@ public: int FI) const override; int getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const; + bool isSupportedStackID(TargetStackID::Value ID) const override { + switch (ID) { + default: + return false; + case TargetStackID::Default: + case TargetStackID::SVEVector: + case TargetStackID::NoAlloc: + return true; + } + } + private: bool shouldCombineCSRLocalStackBump(MachineFunction &MF, unsigned StackBumpBytes) const; + int64_t determineSVEStackSize(MachineFrameInfo &MF, unsigned &MaxAlign) const; }; } // End llvm namespace diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index cd7e927ac80c..1f08505f37e7 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -2053,7 +2053,7 @@ static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits, } static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) { - if (Depth >= 6) + if (Depth 
>= SelectionDAG::MaxRecursionDepth) return; // Initialize UsefulBits if (!Depth) { @@ -2913,49 +2913,6 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; break; - case ISD::EXTRACT_VECTOR_ELT: { - // Extracting lane zero is a special case where we can just use a plain - // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for - // the rest of the compiler, especially the register allocator and copyi - // propagation, to reason about, so is preferred when it's possible to - // use it. - ConstantSDNode *LaneNode = cast<ConstantSDNode>(Node->getOperand(1)); - // Bail and use the default Select() for non-zero lanes. - if (LaneNode->getZExtValue() != 0) - break; - // If the element type is not the same as the result type, likewise - // bail and use the default Select(), as there's more to do than just - // a cross-class COPY. This catches extracts of i8 and i16 elements - // since they will need an explicit zext. - if (VT != Node->getOperand(0).getValueType().getVectorElementType()) - break; - unsigned SubReg; - switch (Node->getOperand(0) - .getValueType() - .getVectorElementType() - .getSizeInBits()) { - default: - llvm_unreachable("Unexpected vector element type!"); - case 64: - SubReg = AArch64::dsub; - break; - case 32: - SubReg = AArch64::ssub; - break; - case 16: - SubReg = AArch64::hsub; - break; - case 8: - llvm_unreachable("unexpected zext-requiring extract element!"); - } - SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT, - Node->getOperand(0)); - LLVM_DEBUG(dbgs() << "ISEL: Custom selection!\n=> "); - LLVM_DEBUG(Extract->dumpr(CurDAG)); - LLVM_DEBUG(dbgs() << "\n"); - ReplaceNode(Node, Extract.getNode()); - return; - } case ISD::Constant: { // Materialize zero constants as copies from WZR/XZR. This allows // the coalescer to propagate these into other instructions. 
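Before moving on to AArch64ISelLowering.cpp, a short aside on the StackOffset arithmetic that the frame-lowering hunks above depend on. The following is an illustrative sketch, not part of the patch; it uses only the interface visible in this diff — the (int64_t, MVT) constructor, operator+, getBytes(), and getScalableBytes() — and the byte counts are hypothetical:

    // Sketch only -- not part of the patch.
    #include "AArch64StackOffset.h"  // added elsewhere in this commit
    #include <cassert>
    using namespace llvm;

    void stackOffsetExample() {
      // MVT::i8 marks a fixed byte quantity; MVT::nxv1i8 marks a byte
      // quantity scaled by the runtime vector multiple (vscale), which is
      // how SVE objects are addressed before the vector length is known.
      StackOffset Locals(32, MVT::i8);      // ordinary locals (hypothetical size)
      StackOffset ZSpill(16, MVT::nxv1i8);  // one Z-register spill (hypothetical)
      StackOffset Total = Locals + ZSpill;  // represents 32 + 16 * vscale bytes
      assert(Total.getBytes() == 32);          // fixed component
      assert(Total.getScalableBytes() == 16);  // scalable component
    }

Keeping the two components separate until the offset is materialized is what lets resolveFrameOffsetReference above choose between FP- and SP-relative addressing for SVE objects without knowing vscale at compile time.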
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 7becc99fb5c7..2746117e8ee5 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -23,6 +23,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" @@ -161,6 +162,29 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addQRTypeForNEON(MVT::v8f16); } + if (Subtarget->hasSVE()) { + // Add legal sve predicate types + addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass); + addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass); + addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass); + addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass); + + // Add legal sve data types + addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass); + + addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv1f32, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv1f64, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass); + } + // Compute derived properties from the register classes computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -283,7 +307,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // AArch64 lacks both left-rotate and popcount instructions. setOperationAction(ISD::ROTL, MVT::i32, Expand); setOperationAction(ISD::ROTL, MVT::i64, Expand); - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); } @@ -297,7 +321,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); } @@ -606,6 +630,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4; + MaxLoadsPerMemcmpOptSize = 4; + MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign() + ? MaxLoadsPerMemcmpOptSize : 8; + setStackPointerRegisterToSaveRestore(AArch64::SP); setSchedulingPreference(Sched::Hybrid); @@ -613,10 +641,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, EnableExtLdPromotion = true; // Set required alignment. - setMinFunctionAlignment(2); + setMinFunctionAlignment(Align(4)); // Set preferred alignments. - setPrefFunctionAlignment(STI.getPrefFunctionAlignment()); - setPrefLoopAlignment(STI.getPrefLoopAlignment()); + setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment())); + setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment())); // Only change the limit for entries in a jump table if specified by // the sub target, but not at the command line. 
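One easy-to-misread detail in the constructor hunk above: the subtarget now reports preferred alignments as log2 values, and the setters take the new Align type rather than a raw byte count. A minimal sketch of the conversion — the log value 4 is invented for illustration, not taken from any CPU model:

    // Sketch only -- not part of the patch.
    #include "llvm/Support/Alignment.h"
    #include <cassert>
    using namespace llvm;

    void alignmentExample() {
      // A log2-encoded preference of 4 becomes Align(1ULL << 4), i.e.
      // 16-byte alignment. Align is constrained to powers of two, so the
      // shift encoding can never produce an invalid value.
      unsigned PrefFunctionLogAlignment = 4;  // hypothetical subtarget value
      Align PrefAlign(1ULL << PrefFunctionLogAlignment);
      assert(PrefAlign.value() == 16);
    }

The same encoding backs setPrefLoopAlignment, and setMinFunctionAlignment(Align(4)) replaces the old log2 argument of 2 — the minimum is still 4 bytes, only the representation changed.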
@@ -725,7 +753,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); // Likewise, narrowing and extending vector loads/stores aren't handled // directly. - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) { @@ -741,7 +769,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); - for (MVT InnerVT : MVT::vector_valuetypes()) { + for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); @@ -773,6 +801,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); } + if (Subtarget->hasSVE()) { + for (MVT VT : MVT::integer_scalable_vector_valuetypes()) { + if (isTypeLegal(VT) && VT.getVectorElementType() != MVT::i1) + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + } + } + PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); } @@ -1025,6 +1060,14 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( Known.One &= Known2.One; break; } + case AArch64ISD::LOADgot: + case AArch64ISD::ADDlow: { + if (!Subtarget->isTargetILP32()) + break; + // In ILP32 mode all valid pointers are in the low 4GB of the address-space. + Known.Zero = APInt::getHighBitsSet(64, 32); + break; + } case ISD::INTRINSIC_W_CHAIN: { ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); @@ -1100,6 +1143,32 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( return true; } +// Same as above but handling LLTs instead. +bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( + LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + bool *Fast) const { + if (Subtarget->requiresStrictAlign()) + return false; + + if (Fast) { + // Some CPUs are fine with unaligned stores except for 128-bit ones. + *Fast = !Subtarget->isMisaligned128StoreSlow() || + Ty.getSizeInBytes() != 16 || + // See comments in performSTORECombine() for more details about + // these conditions. + + // Code that uses clang vector extensions can mark that it + // wants unaligned accesses to be treated as fast by + // underspecifying alignment to be 1 or 2. + Align <= 2 || + + // Disregard v2i64. Memcpy lowering produces those and splitting + // them regresses performance on micro-benchmarks and olden/bh. 
+ Ty == LLT::vector(2, 64); + } + return true; +} + FastISel * AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { @@ -1238,6 +1307,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::STZG: return "AArch64ISD::STZG"; case AArch64ISD::ST2G: return "AArch64ISD::ST2G"; case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G"; + case AArch64ISD::SUNPKHI: return "AArch64ISD::SUNPKHI"; + case AArch64ISD::SUNPKLO: return "AArch64ISD::SUNPKLO"; + case AArch64ISD::UUNPKHI: return "AArch64ISD::UUNPKHI"; + case AArch64ISD::UUNPKLO: return "AArch64ISD::UUNPKLO"; } return nullptr; } @@ -1263,9 +1336,9 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, DebugLoc DL = MI.getDebugLoc(); MachineFunction::iterator It = ++MBB->getIterator(); - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned IfTrueReg = MI.getOperand(1).getReg(); - unsigned IfFalseReg = MI.getOperand(2).getReg(); + Register DestReg = MI.getOperand(0).getReg(); + Register IfTrueReg = MI.getOperand(1).getReg(); + Register IfFalseReg = MI.getOperand(2).getReg(); unsigned CondCode = MI.getOperand(3).getImm(); bool NZCVKilled = MI.getOperand(4).isKill(); @@ -2140,7 +2213,8 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); - return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first; } // Returns true if the given Op is the overflow flag result of an overflow @@ -2349,7 +2423,8 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, // precise. That doesn't take part in the LibCall so we can't directly use // LowerF128Call. 
SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, CallOptions, SDLoc(Op)).first; } @@ -2419,7 +2494,8 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); - return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first; + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, LC, Op.getValueType(), Ops, CallOptions, SDLoc(Op)).first; } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -2773,6 +2849,19 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_sve_sunpkhi: + return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_sunpklo: + return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_uunpkhi: + return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_uunpklo: + return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::localaddress: { const auto &MF = DAG.getMachineFunction(); const auto *RegInfo = Subtarget->getRegisterInfo(); @@ -2937,6 +3026,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerBUILD_VECTOR(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::SPLAT_VECTOR: + return LowerSPLAT_VECTOR(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); case ISD::SRA: @@ -3014,8 +3105,11 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, return CC_AArch64_Win64_VarArg; if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; - return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; - case CallingConv::Win64: + if (!IsVarArg) + return CC_AArch64_DarwinPCS; + return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg + : CC_AArch64_DarwinPCS_VarArg; + case CallingConv::Win64: return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS; case CallingConv::AArch64_VectorCall: return CC_AArch64_AAPCS; @@ -3038,6 +3132,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; + DenseMap<unsigned, SDValue> CopiedRegs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); @@ -3094,11 +3189,10 @@ SDValue AArch64TargetLowering::LowerFormalArguments( continue; } + SDValue ArgValue; if (VA.isRegLoc()) { // Arguments stored in registers. 
EVT RegVT = VA.getLocVT(); - - SDValue ArgValue; const TargetRegisterClass *RC; if (RegVT == MVT::i32) @@ -3113,6 +3207,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments( RC = &AArch64::FPR64RegClass; else if (RegVT == MVT::f128 || RegVT.is128BitVector()) RC = &AArch64::FPR128RegClass; + else if (RegVT.isScalableVector() && + RegVT.getVectorElementType() == MVT::i1) + RC = &AArch64::PPRRegClass; + else if (RegVT.isScalableVector()) + RC = &AArch64::ZPRRegClass; else llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); @@ -3128,20 +3227,23 @@ SDValue AArch64TargetLowering::LowerFormalArguments( llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; + case CCValAssign::Indirect: + assert(VA.getValVT().isScalableVector() && + "Only scalable vectors can be passed indirectly"); + llvm_unreachable("Spilling of SVE vectors not yet implemented"); case CCValAssign::BCvt: ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); break; case CCValAssign::AExt: case CCValAssign::SExt: case CCValAssign::ZExt: - // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt - // nodes after our lowering. - assert(RegVT == Ins[i].VT && "incorrect register location selected"); + break; + case CCValAssign::AExtUpper: + ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue, + DAG.getConstant(32, DL, RegVT)); + ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT()); break; } - - InVals.push_back(ArgValue); - } else { // VA.isRegLoc() assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); unsigned ArgOffset = VA.getLocMemOffset(); @@ -3156,7 +3258,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - SDValue ArgValue; // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; @@ -3165,9 +3266,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments( switch (VA.getLocInfo()) { default: break; + case CCValAssign::Trunc: case CCValAssign::BCvt: MemVT = VA.getLocVT(); break; + case CCValAssign::Indirect: + assert(VA.getValVT().isScalableVector() && + "Only scalable vectors can be passed indirectly"); + llvm_unreachable("Spilling of SVE vectors not yet implemented"); case CCValAssign::SExt: ExtType = ISD::SEXTLOAD; break; @@ -3184,8 +3290,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT); - InVals.push_back(ArgValue); } + if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer()) + ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(), + ArgValue, DAG.getValueType(MVT::i32)); + InVals.push_back(ArgValue); } // varargs @@ -3202,8 +3311,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // This will point to the next argument passed via stack. unsigned StackOffset = CCInfo.getNextStackOffset(); - // We currently pass all varargs at 8-byte alignment. - StackOffset = ((StackOffset + 7) & ~7); + // We currently pass all varargs at 8-byte alignment, or 4 for ILP32 + StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 
4 : 8); FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true)); if (MFI.hasMustTailInVarArgFunc()) { @@ -3233,8 +3342,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( assert(!FuncInfo->getSRetReturnReg()); MVT PtrTy = getPointerTy(DAG.getDataLayout()); - unsigned Reg = - MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); + Register Reg = + MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]); @@ -3366,6 +3475,7 @@ SDValue AArch64TargetLowering::LowerCallResult( : RetCC_AArch64_AAPCS; // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; + DenseMap<unsigned, SDValue> CopiedRegs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC); @@ -3383,10 +3493,16 @@ SDValue AArch64TargetLowering::LowerCallResult( continue; } - SDValue Val = - DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); - Chain = Val.getValue(1); - InFlag = Val.getValue(2); + // Avoid copying a physreg twice since RegAllocFast is incompetent and only + // allows one use of a physreg per block. + SDValue Val = CopiedRegs.lookup(VA.getLocReg()); + if (!Val) { + Val = + DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + CopiedRegs[VA.getLocReg()] = Val; + } switch (VA.getLocInfo()) { default: @@ -3396,6 +3512,15 @@ SDValue AArch64TargetLowering::LowerCallResult( case CCValAssign::BCvt: Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); break; + case CCValAssign::AExtUpper: + Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val, + DAG.getConstant(32, DL, VA.getLocVT())); + LLVM_FALLTHROUGH; + case CCValAssign::AExt: + LLVM_FALLTHROUGH; + case CCValAssign::ZExt: + Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT()); + break; } InVals.push_back(Val); @@ -3593,6 +3718,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool IsVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); + MachineFunction::CallSiteInfo CSInfo; bool IsThisReturn = false; AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); @@ -3709,6 +3835,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, getPointerTy(DAG.getDataLayout())); SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + SmallSet<unsigned, 8> RegsUsed; SmallVector<SDValue, 8> MemOpChains; auto PtrVT = getPointerTy(DAG.getDataLayout()); @@ -3716,7 +3843,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT); - RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + RegsToPass.emplace_back(F.PReg, Val); } } @@ -3747,12 +3874,25 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); break; + case CCValAssign::AExtUpper: + assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); + Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); + Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, + DAG.getConstant(32, DL, VA.getLocVT())); + break; case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + Arg = DAG.getBitcast(VA.getLocVT(), Arg); + break; + case CCValAssign::Trunc: + Arg = DAG.getZExtOrTrunc(Arg, DL, 
VA.getLocVT()); break; case CCValAssign::FPExt: Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); break; + case CCValAssign::Indirect: + assert(VA.getValVT().isScalableVector() && + "Only scalable vectors can be passed indirectly"); + llvm_unreachable("Spilling of SVE vectors not yet implemented"); } if (VA.isRegLoc()) { @@ -3764,7 +3904,33 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, "unexpected use of 'returned'"); IsThisReturn = true; } - RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + if (RegsUsed.count(VA.getLocReg())) { + // If this register has already been used then we're trying to pack + // parts of an [N x i32] into an X-register. The extension type will + // take care of putting the two halves in the right place but we have to + // combine them. + SDValue &Bits = + std::find_if(RegsToPass.begin(), RegsToPass.end(), + [=](const std::pair<unsigned, SDValue> &Elt) { + return Elt.first == VA.getLocReg(); + }) + ->second; + Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); + // Call site info is used for function's parameter entry value + // tracking. For now we track only simple cases when parameter + // is transferred through whole register. + CSInfo.erase(std::remove_if(CSInfo.begin(), CSInfo.end(), + [&VA](MachineFunction::ArgRegPair ArgReg) { + return ArgReg.Reg == VA.getLocReg(); + }), + CSInfo.end()); + } else { + RegsToPass.emplace_back(VA.getLocReg(), Arg); + RegsUsed.insert(VA.getLocReg()); + const TargetOptions &Options = DAG.getTarget().Options; + if (Options.EnableDebugEntryValues) + CSInfo.emplace_back(VA.getLocReg(), i); + } } else { assert(VA.isMemLoc()); @@ -3899,6 +4065,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Ops.push_back(DAG.getRegister(RegToPass.first, RegToPass.second.getValueType())); + // Check callee args/returns for SVE registers and set calling convention + // accordingly. + if (CallConv == CallingConv::C) { + bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){ + return Out.VT.isScalableVector(); + }); + bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){ + return In.VT.isScalableVector(); + }); + + if (CalleeInSVE || CalleeOutSVE) + CallConv = CallingConv::AArch64_SVE_VectorCall; + } + // Add a register mask operand representing the call-preserved registers. const uint32_t *Mask; const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); @@ -3930,12 +4110,15 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // actual call instruction. if (IsTailCall) { MF.getFrameInfo().setHasTailCall(); - return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); + SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); + DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); + return Ret; } // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); InFlag = Chain.getValue(1); + DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; @@ -3983,7 +4166,8 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Copy the result values into the output registers. 
SDValue Flag; - SmallVector<SDValue, 4> RetOps(1, Chain); + SmallVector<std::pair<unsigned, SDValue>, 4> RetVals; + SmallSet<unsigned, 4> RegsUsed; for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); ++i, ++realRVLocIdx) { CCValAssign &VA = RVLocs[i]; @@ -4005,11 +4189,38 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); break; + case CCValAssign::AExt: + case CCValAssign::ZExt: + Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); + break; + case CCValAssign::AExtUpper: + assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); + Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); + Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, + DAG.getConstant(32, DL, VA.getLocVT())); + break; } - Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); + if (RegsUsed.count(VA.getLocReg())) { + SDValue &Bits = + std::find_if(RetVals.begin(), RetVals.end(), + [=](const std::pair<unsigned, SDValue> &Elt) { + return Elt.first == VA.getLocReg(); + }) + ->second; + Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); + } else { + RetVals.emplace_back(VA.getLocReg(), Arg); + RegsUsed.insert(VA.getLocReg()); + } + } + + SmallVector<SDValue, 4> RetOps(1, Chain); + for (auto &RetVal : RetVals) { + Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag); Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + RetOps.push_back( + DAG.getRegister(RetVal.first, RetVal.second.getValueType())); } // Windows AArch64 ABIs require that for returning structs by value we copy @@ -4139,8 +4350,7 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GN->getGlobal(); - unsigned char OpFlags = - Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); + unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); if (OpFlags != AArch64II::MO_NO_FLAG) assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && @@ -4204,6 +4414,7 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, SDLoc DL(Op); MVT PtrVT = getPointerTy(DAG.getDataLayout()); + MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout()); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); SDValue TLVPAddr = @@ -4214,13 +4425,15 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, // to obtain the address of the variable. SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad( - MVT::i64, DL, Chain, DescAddr, + PtrMemVT, DL, Chain, DescAddr, MachinePointerInfo::getGOT(DAG.getMachineFunction()), - /* Alignment = */ 8, - MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant | - MachineMemOperand::MODereferenceable); + /* Alignment = */ PtrMemVT.getSizeInBits() / 8, + MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); Chain = FuncTLVGet.getValue(1); + // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer. + FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT); + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setAdjustsStack(true); @@ -4470,7 +4683,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // value of a libcall against zero, which is just what the rest of LowerBR_CC // is expecting to deal with. 
if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. @@ -4736,7 +4949,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Handle f128 first, since one possible outcome is a normal integer // comparison which gets picked up by the next if statement. if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); // If softenSetCCOperands returned a scalar, use it. if (!RHS.getNode()) { @@ -4798,7 +5011,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, // Handle f128 first, because it will result in a comparison of some RTLIB // call result against zero. if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. @@ -5096,6 +5309,7 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, SDLoc DL(Op); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy(DAG.getDataLayout())); + FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout())); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV)); @@ -5202,15 +5416,15 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single // pointer. SDLoc DL(Op); - unsigned VaListSize = - Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32; + unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; + unsigned VaListSize = (Subtarget->isTargetDarwin() || + Subtarget->isTargetWindows()) ? PtrSize : 32; const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); - return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), - Op.getOperand(2), - DAG.getConstant(VaListSize, DL, MVT::i32), - 8, false, false, false, MachinePointerInfo(DestSV), + return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(VaListSize, DL, MVT::i32), PtrSize, + false, false, false, MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); } @@ -5224,12 +5438,15 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(1); unsigned Align = Op.getConstantOperandVal(3); + unsigned MinSlotSize = Subtarget->isTargetILP32() ? 
4 : 8; auto PtrVT = getPointerTy(DAG.getDataLayout()); - - SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V)); + auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); + SDValue VAList = + DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V)); Chain = VAList.getValue(1); + VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT); - if (Align > 8) { + if (Align > MinSlotSize) { assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Align - 1, DL, PtrVT)); @@ -5238,14 +5455,14 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { } Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); - uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); + unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); // Scalar integer and FP values smaller than 64 bits are implicitly extended // up to 64 bits. At the very least, we have to increase the striding of the // vaargs list to match this, and for FP values we need to introduce // FP_ROUND nodes as well. if (VT.isInteger() && !VT.isVector()) - ArgSize = 8; + ArgSize = std::max(ArgSize, MinSlotSize); bool NeedFPTrunc = false; if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { ArgSize = 8; @@ -5255,6 +5472,8 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { // Increment the pointer, VAList, to the next vaarg SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(ArgSize, DL, PtrVT)); + VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT); + // Store the incremented VAList to the legalized pointer SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V)); @@ -5284,10 +5503,15 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SDLoc DL(Op); unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); SDValue FrameAddr = - DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); + DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64); while (Depth--) FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, MachinePointerInfo()); + + if (Subtarget->isTargetILP32()) + FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr, + DAG.getValueType(VT)); + return FrameAddr; } @@ -5306,9 +5530,9 @@ SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op, // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. 
-unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { - unsigned Reg = MatchRegisterName(RegName); +Register AArch64TargetLowering:: +getRegisterByName(const char* RegName, EVT VT, const MachineFunction &MF) const { + Register Reg = MatchRegisterName(RegName); if (AArch64::X1 <= Reg && Reg <= AArch64::X28) { const MCRegisterInfo *MRI = Subtarget->getRegisterInfo(); unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false); @@ -5653,6 +5877,21 @@ const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { return "r"; } +enum PredicateConstraint { + Upl, + Upa, + Invalid +}; + +static PredicateConstraint parsePredicateConstraint(StringRef Constraint) { + PredicateConstraint P = PredicateConstraint::Invalid; + if (Constraint == "Upa") + P = PredicateConstraint::Upa; + if (Constraint == "Upl") + P = PredicateConstraint::Upl; + return P; +} + /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. AArch64TargetLowering::ConstraintType @@ -5661,19 +5900,30 @@ AArch64TargetLowering::getConstraintType(StringRef Constraint) const { switch (Constraint[0]) { default: break; - case 'z': - return C_Other; case 'x': case 'w': + case 'y': return C_RegisterClass; // An address with a single base register. Due to the way we // currently handle addresses it is the same as 'r'. case 'Q': return C_Memory; + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + case 'Y': + case 'Z': + return C_Immediate; + case 'z': case 'S': // A symbolic address return C_Other; } - } + } else if (parsePredicateConstraint(Constraint) != + PredicateConstraint::Invalid) + return C_RegisterClass; return TargetLowering::getConstraintType(Constraint); } @@ -5697,12 +5947,17 @@ AArch64TargetLowering::getSingleConstraintMatchWeight( break; case 'x': case 'w': + case 'y': if (type->isFloatingPointTy() || type->isVectorTy()) weight = CW_Register; break; case 'z': weight = CW_Constant; break; + case 'U': + if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid) + weight = CW_Register; + break; } return weight; } @@ -5719,6 +5974,8 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( case 'w': if (!Subtarget->hasFPARMv8()) break; + if (VT.isScalableVector()) + return std::make_pair(0U, &AArch64::ZPRRegClass); if (VT.getSizeInBits() == 16) return std::make_pair(0U, &AArch64::FPR16RegClass); if (VT.getSizeInBits() == 32) @@ -5733,9 +5990,25 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( case 'x': if (!Subtarget->hasFPARMv8()) break; + if (VT.isScalableVector()) + return std::make_pair(0U, &AArch64::ZPR_4bRegClass); if (VT.getSizeInBits() == 128) return std::make_pair(0U, &AArch64::FPR128_loRegClass); break; + case 'y': + if (!Subtarget->hasFPARMv8()) + break; + if (VT.isScalableVector()) + return std::make_pair(0U, &AArch64::ZPR_3bRegClass); + break; + } + } else { + PredicateConstraint PC = parsePredicateConstraint(Constraint); + if (PC != PredicateConstraint::Invalid) { + assert(VT.isScalableVector()); + bool restricted = (PC == PredicateConstraint::Upl); + return restricted ? 
std::make_pair(0U, &AArch64::PPR_3bRegClass) + : std::make_pair(0U, &AArch64::PPRRegClass); } } if (StringRef("{cc}").equals_lower(Constraint)) @@ -6279,6 +6552,8 @@ static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); + if (NumElts % 2 != 0) + return false; WhichResult = (M[0] == 0 ? 0 : 1); unsigned Idx = WhichResult * NumElts / 2; for (unsigned i = 0; i != NumElts; i += 2) { @@ -6446,8 +6721,7 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { if (!isConcatMask(Mask, VT, SplitV0)) return SDValue(); - EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - VT.getVectorNumElements() / 2); + EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); if (SplitV0) { V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, DAG.getConstant(0, DL, MVT::i64)); @@ -6790,6 +7064,41 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return GenerateTBL(Op, ShuffleMask, DAG); } +SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + EVT ElemVT = VT.getScalarType(); + + SDValue SplatVal = Op.getOperand(0); + + // Extend input splat value where needed to fit into a GPR (32b or 64b only) + // FPRs don't have this restriction. + switch (ElemVT.getSimpleVT().SimpleTy) { + case MVT::i8: + case MVT::i16: + SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32); + break; + case MVT::i64: + SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); + break; + case MVT::i32: + // Fine as is + break; + // TODO: we can support splats of i1s and float types, but haven't added + // patterns yet. + case MVT::i1: + case MVT::f16: + case MVT::f32: + case MVT::f64: + default: + llvm_unreachable("Unsupported SPLAT_VECTOR input operand type"); + break; + } + + return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); +} + static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits) { EVT VT = BVN->getValueType(0); @@ -8063,7 +8372,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; - Info.align = 0; + Info.align.reset(); // volatile loads with NEON intrinsics not supported Info.flags = MachineMemOperand::MOLoad; return true; @@ -8089,7 +8398,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; - Info.align = 0; + Info.align.reset(); // volatile stores with NEON intrinsics not supported Info.flags = MachineMemOperand::MOStore; return true; @@ -8101,7 +8410,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); + Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } @@ -8112,7 +8421,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = 
DL.getABITypeAlignment(PtrTy->getElementType()); + Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } @@ -8122,7 +8431,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = 16; + Info.align = Align(16); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; case Intrinsic::aarch64_stlxp: @@ -8131,7 +8440,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(2); Info.offset = 0; - Info.align = 16; + Info.align = Align(16); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; default: @@ -8278,7 +8587,7 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { // Get the shift amount based on the scaling factor: // log2(sizeof(IdxTy)) - log2(8). uint64_t ShiftAmt = - countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3; + countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3; // Is the constant foldable in the shift of the addressing mode? // I.e., shift amount is between 1 and 4 inclusive. if (ShiftAmt == 0 || ShiftAmt > 4) @@ -8739,6 +9048,39 @@ EVT AArch64TargetLowering::getOptimalMemOpType( return MVT::Other; } +LLT AArch64TargetLowering::getOptimalMemOpLLT( + uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const { + bool CanImplicitFloat = + !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); + bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; + bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; + // Only use AdvSIMD to implement memset of 32-byte and above. It would have + // taken one instruction to materialize the v2i64 zero and one store (with + // restrictive addressing mode). Just do i64 stores. + bool IsSmallMemset = IsMemset && Size < 32; + auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) { + if (memOpAlign(SrcAlign, DstAlign, AlignCheck)) + return true; + bool Fast; + return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone, + &Fast) && + Fast; + }; + + if (CanUseNEON && IsMemset && !IsSmallMemset && + AlignmentIsAcceptable(MVT::v2i64, 16)) + return LLT::vector(2, 64); + if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16)) + return LLT::scalar(128); + if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8)) + return LLT::scalar(64); + if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4)) + return LLT::scalar(32); + return LLT(); +} + // 12-bit optionally shifted immediates are legal for adds. bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { if (Immed == std::numeric_limits<int64_t>::min()) { @@ -10065,6 +10407,14 @@ static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { Opcode = AArch64ISD::SQSHLU_I; IsRightShift = false; break; + case Intrinsic::aarch64_neon_sshl: + case Intrinsic::aarch64_neon_ushl: + // For positive shift amounts we can use SHL, as ushl/sshl perform a regular + // left shift for positive shift amounts. Below, we only replace the current + // node with VSHL, if this condition is met. 
+ Opcode = AArch64ISD::VSHL; + IsRightShift = false; + break; } if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { @@ -10151,6 +10501,8 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_neon_sqshlu: case Intrinsic::aarch64_neon_srshl: case Intrinsic::aarch64_neon_urshl: + case Intrinsic::aarch64_neon_sshl: + case Intrinsic::aarch64_neon_ushl: return tryCombineShiftImm(IID, N, DAG); case Intrinsic::aarch64_crc32b: case Intrinsic::aarch64_crc32cb: @@ -10482,10 +10834,10 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return ReplacedSplat; SDLoc DL(S); - unsigned NumElts = VT.getVectorNumElements() / 2; + // Split VT into two. - EVT HalfVT = - EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); + EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); + unsigned NumElts = HalfVT.getVectorNumElements(); SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, DAG.getConstant(0, DL, MVT::i64)); SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, @@ -10567,7 +10919,7 @@ static SDValue performPostLD1Combine(SDNode *N, // are predecessors to each other or the Vector. SmallPtrSet<const SDNode *, 32> Visited; SmallVector<const SDNode *, 16> Worklist; - Visited.insert(N); + Visited.insert(Addr.getNode()); Worklist.push_back(User); Worklist.push_back(LD); Worklist.push_back(Vector.getNode()); @@ -11983,6 +12335,27 @@ bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial( return Mask->getValue().isPowerOf2(); } +bool AArch64TargetLowering:: + shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, + unsigned OldShiftOpcode, unsigned NewShiftOpcode, + SelectionDAG &DAG) const { + // Does baseline recommend not to perform the fold by default? + if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG)) + return false; + // Else, if this is a vector shift, prefer 'shl'. + return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL; +} + +bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG, + SDNode *N) const { + if (DAG.getMachineFunction().getFunction().hasMinSize() && + !Subtarget->isTargetWindows()) + return false; + return true; +} + void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { // Update IsSplitCSR in AArch64unctionInfo. AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>(); @@ -12009,7 +12382,7 @@ void AArch64TargetLowering::insertCopiesSplitCSR( else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); - unsigned NewVR = MRI->createVirtualRegister(RC); + Register NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. 
// FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index 4421c31f65c9..00fa96bc4e6d 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -191,6 +191,11 @@ enum NodeType : unsigned { FRECPE, FRECPS, FRSQRTE, FRSQRTS, + SUNPKHI, + SUNPKLO, + UUNPKHI, + UUNPKLO, + // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, @@ -261,6 +266,14 @@ public: const SelectionDAG &DAG, unsigned Depth = 0) const override; + MVT getPointerTy(const DataLayout &DL, uint32_t AS = 0) const override { + // Returning i64 unconditionally here (i.e. even for ILP32) means that the + // *DAG* representation of pointers will always be 64-bits. They will be + // truncated and extended when transferred to memory, but the 64-bit DAG + // allows us to use AArch64's addressing modes much more easily. + return MVT::getIntegerVT(64); + } + bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const override; @@ -272,6 +285,10 @@ public: EVT VT, unsigned AddrSpace = 0, unsigned Align = 1, MachineMemOperand::Flags Flags = MachineMemOperand::MONone, bool *Fast = nullptr) const override; + /// LLT variant. + bool allowsMisalignedMemoryAccesses( + LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + bool *Fast = nullptr) const override; /// Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; @@ -358,6 +375,10 @@ public: bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, const AttributeList &FuncAttributes) const override; + LLT getOptimalMemOpLLT(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const override; + /// Return true if the addressing mode represented by AM is legal for this /// target, for a load/store of the specified type. 
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, @@ -480,11 +501,12 @@ public: return VT.getSizeInBits() >= 64; // vector 'bic' } - bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override { - if (DAG.getMachineFunction().getFunction().hasMinSize()) - return false; - return true; - } + bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, + unsigned OldShiftOpcode, unsigned NewShiftOpcode, + SelectionDAG &DAG) const override; + + bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override; bool shouldTransformSignedTruncationCheck(EVT XVT, unsigned KeptBits) const override { @@ -655,6 +677,7 @@ private: SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; @@ -690,8 +713,8 @@ private: unsigned combineRepeatedFPDivisors() const override; ConstraintType getConstraintType(StringRef Constraint) const override; - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td index e22cb44d81ae..459b53923625 100644 --- a/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -204,19 +204,27 @@ def : Pat<(relaxed_store<atomic_store_64> def ldxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }]; +} def ldxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }]; +} def ldxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }]; +} def ldxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }]; +} def : Pat<(ldxr_1 GPR64sp:$addr), (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>; @@ -237,19 +245,27 @@ def : Pat<(and (ldxr_4 GPR64sp:$addr), 0xffffffff), def ldaxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }]; +} def ldaxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }]; +} def ldaxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr 
 def ldaxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
   return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
-}]>;
+}]> {
+  let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }];
+}
 
 def ldaxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
   return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
-}]>;
+}]> {
+  let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }];
+}
 
 def : Pat<(ldaxr_1 GPR64sp:$addr),
           (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>;
@@ -271,22 +287,30 @@ def : Pat<(and (ldaxr_4 GPR64sp:$addr), 0xffffffff),
 
 def stxr_1 : PatFrag<(ops node:$val, node:$ptr),
                      (int_aarch64_stxr node:$val, node:$ptr), [{
   return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
-}]>;
+}]> {
+  let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }];
+}
 
 def stxr_2 : PatFrag<(ops node:$val, node:$ptr),
                      (int_aarch64_stxr node:$val, node:$ptr), [{
   return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
-}]>;
+}]> {
+  let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }];
+}
 
 def stxr_4 : PatFrag<(ops node:$val, node:$ptr),
                      (int_aarch64_stxr node:$val, node:$ptr), [{
   return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
-}]>;
+}]> {
+  let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }];
+}
 
 def stxr_8 : PatFrag<(ops node:$val, node:$ptr),
                      (int_aarch64_stxr node:$val, node:$ptr), [{
   return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
-}]>;
+}]> {
+  let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }];
+}
 
 def : Pat<(stxr_1 GPR64:$val, GPR64sp:$addr),
@@ -317,22 +341,30 @@ def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr),
 
 def stlxr_1 : PatFrag<(ops node:$val, node:$ptr),
                      (int_aarch64_stlxr node:$val, node:$ptr), [{
   return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
-}]>;
+}]> {
+  let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }];
+}
 
 def stlxr_2 : PatFrag<(ops node:$val, node:$ptr),
                      (int_aarch64_stlxr node:$val, node:$ptr), [{
   return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
-}]>;
+}]> {
+  let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }];
+}
 
 def stlxr_4 : PatFrag<(ops node:$val, node:$ptr),
                      (int_aarch64_stlxr node:$val, node:$ptr), [{
   return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
-}]>;
+}]> {
+  let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }];
+}
 
 def stlxr_8 : PatFrag<(ops node:$val, node:$ptr),
                      (int_aarch64_stlxr node:$val, node:$ptr), [{
   return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
-}]>;
+}]> {
+  let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }];
+}
 
 def : Pat<(stlxr_1 GPR64:$val, GPR64sp:$addr),
@@ -422,4 +454,3 @@ let Predicates = [HasLSE] in {
   defm : LDOPregister_patterns_mod<"LDADD", "atomic_load_sub", "SUB">;
   defm : LDOPregister_patterns_mod<"LDCLR", "atomic_load_and", "ORN">;
 }
-
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index d619137b55c5..f555e4123307 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -480,76 +480,40 @@ def BranchTarget14Operand : BranchTarget<14>;
 def BranchTarget26Operand : BranchTarget<26>;
 def PCRelLabel19Operand   : PCRelLabel<19>;
 
-def MovZSymbolG3AsmOperand : AsmOperandClass {
-  let Name = "MovZSymbolG3";
+def MovWSymbolG3AsmOperand : AsmOperandClass {
+  let Name = "MovWSymbolG3";
   let RenderMethod = "addImmOperands";
 }
 
-def movz_symbol_g3 : Operand<i32> {
-  let ParserMatchClass = MovZSymbolG3AsmOperand;
+def movw_symbol_g3 : Operand<i32> {
+  let ParserMatchClass = MovWSymbolG3AsmOperand;
 }
 
-def MovZSymbolG2AsmOperand : AsmOperandClass {
-  let Name = "MovZSymbolG2";
+def MovWSymbolG2AsmOperand : AsmOperandClass {
+  let Name = "MovWSymbolG2";
   let RenderMethod = "addImmOperands";
 }
 
-def movz_symbol_g2 : Operand<i32> {
-  let ParserMatchClass = MovZSymbolG2AsmOperand;
+def movw_symbol_g2 : Operand<i32> {
+  let ParserMatchClass = MovWSymbolG2AsmOperand;
 }
 
-def MovZSymbolG1AsmOperand : AsmOperandClass {
-  let Name = "MovZSymbolG1";
+def MovWSymbolG1AsmOperand : AsmOperandClass {
+  let Name = "MovWSymbolG1";
   let RenderMethod = "addImmOperands";
 }
 
-def movz_symbol_g1 : Operand<i32> {
-  let ParserMatchClass = MovZSymbolG1AsmOperand;
+def movw_symbol_g1 : Operand<i32> {
+  let ParserMatchClass = MovWSymbolG1AsmOperand;
 }
 
-def MovZSymbolG0AsmOperand : AsmOperandClass {
-  let Name = "MovZSymbolG0";
+def MovWSymbolG0AsmOperand : AsmOperandClass {
+  let Name = "MovWSymbolG0";
   let RenderMethod = "addImmOperands";
 }
 
-def movz_symbol_g0 : Operand<i32> {
-  let ParserMatchClass = MovZSymbolG0AsmOperand;
-}
-
-def MovKSymbolG3AsmOperand : AsmOperandClass {
-  let Name = "MovKSymbolG3";
-  let RenderMethod = "addImmOperands";
-}
-
-def movk_symbol_g3 : Operand<i32> {
-  let ParserMatchClass = MovKSymbolG3AsmOperand;
-}
-
-def MovKSymbolG2AsmOperand : AsmOperandClass {
-  let Name = "MovKSymbolG2";
-  let RenderMethod = "addImmOperands";
-}
-
-def movk_symbol_g2 : Operand<i32> {
-  let ParserMatchClass = MovKSymbolG2AsmOperand;
-}
-
-def MovKSymbolG1AsmOperand : AsmOperandClass {
-  let Name = "MovKSymbolG1";
-  let RenderMethod = "addImmOperands";
-}
-
-def movk_symbol_g1 : Operand<i32> {
-  let ParserMatchClass = MovKSymbolG1AsmOperand;
-}
-
-def MovKSymbolG0AsmOperand : AsmOperandClass {
-  let Name = "MovKSymbolG0";
-  let RenderMethod = "addImmOperands";
-}
-
-def movk_symbol_g0 : Operand<i32> {
-  let ParserMatchClass = MovKSymbolG0AsmOperand;
+def movw_symbol_g0 : Operand<i32> {
+  let ParserMatchClass = MovWSymbolG0AsmOperand;
 }
 
 class fixedpoint_i32<ValueType FloatVT>
@@ -673,6 +637,11 @@ def logical_imm64_XFORM : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
 }]>;
 
+def gi_logical_imm32_XFORM : GICustomOperandRenderer<"renderLogicalImm32">,
+                             GISDNodeXFormEquiv<logical_imm32_XFORM>;
+def gi_logical_imm64_XFORM : GICustomOperandRenderer<"renderLogicalImm64">,
+                             GISDNodeXFormEquiv<logical_imm64_XFORM>;
+
 let DiagnosticType = "LogicalSecondSource" in {
   def LogicalImm32Operand : AsmOperandClass {
     let Name = "LogicalImm32";
@@ -714,12 +683,15 @@ def logical_imm64_not : Operand<i64> {
   let ParserMatchClass = LogicalImm64NotOperand;
 }
 
-// imm0_65535 predicate - True if the immediate is in the range [0,65535].
-def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
+// iXX_imm0_65535 predicates - True if the immediate is in the range [0,65535].
+let ParserMatchClass = AsmImmRange<0, 65535>, PrintMethod = "printImmHex" in {
+def i32_imm0_65535 : Operand<i32>, TImmLeaf<i32, [{
   return ((uint32_t)Imm) < 65536;
-}]> {
-  let ParserMatchClass = AsmImmRange<0, 65535>;
-  let PrintMethod = "printImmHex";
-}
+}]>;
+
+def i64_imm0_65535 : Operand<i64>, TImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 65536;
+}]>;
+}
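The gi_logical_imm32_XFORM / gi_logical_imm64_XFORM records above pair each SDNodeXForm with a named C++ renderer for GlobalISel. A sketch of what renderLogicalImm32 presumably does, mirroring the SelectionDAG XForm (signature and operand index are assumptions, not quoted from the commit):

    // Re-encode the matched 32-bit constant into AArch64's packed N:immr:imms
    // logical-immediate form before it is added to the selected instruction.
    void renderLogicalImm32(llvm::MachineInstrBuilder &MIB,
                            const llvm::MachineInstr &I) {
      uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
      uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
      MIB.addImm(Enc);
    }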
 // imm0_255 predicate - True if the immediate is in the range [0,255].
@@ -815,6 +787,14 @@ class arith_shifted_reg<ValueType Ty, RegisterClass regclass, int width>
 def arith_shifted_reg32 : arith_shifted_reg<i32, GPR32, 32>;
 def arith_shifted_reg64 : arith_shifted_reg<i64, GPR64, 64>;
 
+def gi_arith_shifted_reg32 :
+    GIComplexOperandMatcher<s32, "selectArithShiftedRegister">,
+    GIComplexPatternEquiv<arith_shifted_reg32>;
+
+def gi_arith_shifted_reg64 :
+    GIComplexOperandMatcher<s64, "selectArithShiftedRegister">,
+    GIComplexPatternEquiv<arith_shifted_reg64>;
+
 // An arithmetic shifter operand:
 //  {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror
 //  {5-0} - imm6
@@ -837,6 +817,14 @@ class logical_shifted_reg<ValueType Ty, RegisterClass regclass, Operand shiftop>
 def logical_shifted_reg32 : logical_shifted_reg<i32, GPR32, logical_shift32>;
 def logical_shifted_reg64 : logical_shifted_reg<i64, GPR64, logical_shift64>;
 
+def gi_logical_shifted_reg32 :
+    GIComplexOperandMatcher<s32, "selectLogicalShiftedRegister">,
+    GIComplexPatternEquiv<logical_shifted_reg32>;
+
+def gi_logical_shifted_reg64 :
+    GIComplexOperandMatcher<s64, "selectLogicalShiftedRegister">,
+    GIComplexPatternEquiv<logical_shifted_reg64>;
+
 // A logical vector shifter operand:
 //  {7-6} - shift type: 00 = lsl
 //  {5-0} - imm6: #0, #8, #16, or #24
@@ -918,6 +906,14 @@ class neg_addsub_shifted_imm<ValueType Ty>
 def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm<i32>;
 def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm<i64>;
 
+def gi_neg_addsub_shifted_imm32 :
+    GIComplexOperandMatcher<s32, "selectNegArithImmed">,
+    GIComplexPatternEquiv<neg_addsub_shifted_imm32>;
+
+def gi_neg_addsub_shifted_imm64 :
+    GIComplexOperandMatcher<s64, "selectNegArithImmed">,
+    GIComplexPatternEquiv<neg_addsub_shifted_imm64>;
+
 // An extend operand:
 //  {5-3} - extend type
 //  {2-0} - imm3
@@ -948,6 +944,21 @@ class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>,
   let MIOperandInfo = (ops GPR32, arith_extend64);
 }
 
+def arith_extended_reg32_i32 : arith_extended_reg32<i32>;
+def gi_arith_extended_reg32_i32 :
+    GIComplexOperandMatcher<s32, "selectArithExtendedRegister">,
+    GIComplexPatternEquiv<arith_extended_reg32_i32>;
+
+def arith_extended_reg32_i64 : arith_extended_reg32<i64>;
+def gi_arith_extended_reg32_i64 :
+    GIComplexOperandMatcher<s64, "selectArithExtendedRegister">,
+    GIComplexPatternEquiv<arith_extended_reg32_i64>;
+
+def arith_extended_reg32to64_i64 : arith_extended_reg32to64<i64>;
+def gi_arith_extended_reg32to64_i64 :
+    GIComplexOperandMatcher<s64, "selectArithExtendedRegister">,
+    GIComplexPatternEquiv<arith_extended_reg32to64_i64>;
+
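Each GIComplexOperandMatcher above names a selector routine that must hand back one renderer per operand the equivalent ComplexPattern produces (here: the register plus the encoded shift). A heavily trimmed sketch of that shape, assuming LSL-by-constant only (the in-tree selectArithShiftedRegister also handles LSR/ASR and profitability checks):

    llvm::InstructionSelector::ComplexRendererFns
    selectArithShiftedRegisterSketch(llvm::MachineOperand &Root,
                                     llvm::MachineRegisterInfo &MRI) {
      using namespace llvm;
      MachineInstr *Def = MRI.getVRegDef(Root.getReg());
      if (!Def || Def->getOpcode() != TargetOpcode::G_SHL)
        return None; // not a shift; select the operand normally
      auto Imm = getConstantVRegVal(Def->getOperand(2).getReg(), MRI);
      if (!Imm || *Imm < 0 || *Imm > 63)
        return None;
      Register Src = Def->getOperand(1).getReg();
      uint64_t Enc = AArch64_AM::getShifterImm(AArch64_AM::LSL, *Imm);
      return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Src); },
               [=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
    }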
 // Floating-point immediate.
 def fpimm16 : Operand<f16>, FPImmLeaf<f16, [{
@@ -1000,8 +1011,8 @@ class AsmVectorIndex<int Min, int Max, string NamePrefix=""> : AsmOperandClass {
   let RenderMethod = "addVectorIndexOperands";
 }
 
-class AsmVectorIndexOpnd<AsmOperandClass mc, code pred>
-    : Operand<i64>, ImmLeaf<i64, pred> {
+class AsmVectorIndexOpnd<ValueType ty, AsmOperandClass mc, code pred>
+    : Operand<ty>, ImmLeaf<ty, pred> {
   let ParserMatchClass = mc;
   let PrintMethod = "printVectorIndex";
 }
@@ -1012,11 +1023,17 @@ def VectorIndexHOperand : AsmVectorIndex<0, 7>;
 def VectorIndexSOperand : AsmVectorIndex<0, 3>;
 def VectorIndexDOperand : AsmVectorIndex<0, 1>;
 
-def VectorIndex1 : AsmVectorIndexOpnd<VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>;
-def VectorIndexB : AsmVectorIndexOpnd<VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>;
-def VectorIndexH : AsmVectorIndexOpnd<VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>;
-def VectorIndexS : AsmVectorIndexOpnd<VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>;
-def VectorIndexD : AsmVectorIndexOpnd<VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>;
+def VectorIndex1 : AsmVectorIndexOpnd<i64, VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>;
+def VectorIndexB : AsmVectorIndexOpnd<i64, VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>;
+def VectorIndexH : AsmVectorIndexOpnd<i64, VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>;
+def VectorIndexS : AsmVectorIndexOpnd<i64, VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>;
+def VectorIndexD : AsmVectorIndexOpnd<i64, VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>;
+
+def VectorIndex132b : AsmVectorIndexOpnd<i32, VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>;
+def VectorIndexB32b : AsmVectorIndexOpnd<i32, VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>;
+def VectorIndexH32b : AsmVectorIndexOpnd<i32, VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>;
+def VectorIndexS32b : AsmVectorIndexOpnd<i32, VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>;
+def VectorIndexD32b : AsmVectorIndexOpnd<i32, VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>;
 
 def SVEVectorIndexExtDupBOperand : AsmVectorIndex<0, 63, "SVE">;
 def SVEVectorIndexExtDupHOperand : AsmVectorIndex<0, 31, "SVE">;
@@ -1025,15 +1042,15 @@ def SVEVectorIndexExtDupDOperand : AsmVectorIndex<0, 7, "SVE">;
 def SVEVectorIndexExtDupQOperand : AsmVectorIndex<0, 3, "SVE">;
 
 def sve_elm_idx_extdup_b
-    : AsmVectorIndexOpnd<SVEVectorIndexExtDupBOperand, [{ return ((uint64_t)Imm) < 64; }]>;
+    : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupBOperand, [{ return ((uint64_t)Imm) < 64; }]>;
 def sve_elm_idx_extdup_h
-    : AsmVectorIndexOpnd<SVEVectorIndexExtDupHOperand, [{ return ((uint64_t)Imm) < 32; }]>;
+    : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupHOperand, [{ return ((uint64_t)Imm) < 32; }]>;
 def sve_elm_idx_extdup_s
-    : AsmVectorIndexOpnd<SVEVectorIndexExtDupSOperand, [{ return ((uint64_t)Imm) < 16; }]>;
+    : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupSOperand, [{ return ((uint64_t)Imm) < 16; }]>;
 def sve_elm_idx_extdup_d
-    : AsmVectorIndexOpnd<SVEVectorIndexExtDupDOperand, [{ return ((uint64_t)Imm) < 8; }]>;
+    : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupDOperand, [{ return ((uint64_t)Imm) < 8; }]>;
 def sve_elm_idx_extdup_q
-    : AsmVectorIndexOpnd<SVEVectorIndexExtDupQOperand, [{ return ((uint64_t)Imm) < 4; }]>;
+    : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupQOperand, [{ return ((uint64_t)Imm) < 4; }]>;
 
 // 8-bit immediate for AdvSIMD where 64-bit values of the form:
 //   aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
@@ -1082,6 +1099,45 @@ class RtSystemI<bit L, dag oops, dag iops, string asm, string operands>
   let Inst{4-0} = Rt;
 }
 
+// System instructions for transactional memory extension
+class TMBaseSystemI<bit L, bits<4> CRm, bits<3> op2, dag oops, dag iops,
+                    string asm, string operands, list<dag> pattern>
+    : BaseSystemI<L, oops, iops, asm, operands, pattern>,
+      Sched<[WriteSys]> {
+  let Inst{20-12} = 0b000110011;
+  let Inst{11-8} = CRm;
+  let Inst{7-5} = op2;
+  let DecoderMethod = "";
+
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
+// System instructions for transactional memory - single input operand
+class TMSystemI<bits<4> CRm, string asm, list<dag> pattern>
+    : TMBaseSystemI<0b1, CRm, 0b011,
+                    (outs GPR64:$Rt), (ins), asm, "\t$Rt", pattern> {
+  bits<5> Rt;
+  let Inst{4-0} = Rt;
+}
+
+// System instructions for transactional memory - no operand
+class TMSystemINoOperand<bits<4> CRm, string asm, list<dag> pattern>
+    : TMBaseSystemI<0b0, CRm, 0b011, (outs), (ins), asm, "", pattern> {
+  let Inst{4-0} = 0b11111;
+}
+
+// System instructions for exit from transactions
+class TMSystemException<bits<3> op1, string asm, list<dag> pattern>
+    : I<(outs), (ins i64_imm0_65535:$imm), asm, "\t$imm", "", pattern>,
+      Sched<[WriteSys]> {
+  bits<16> imm;
+  let Inst{31-24} = 0b11010100;
+  let Inst{23-21} = op1;
+  let Inst{20-5} = imm;
+  let Inst{4-0} = 0b00000;
+}
+
 // Hint instructions that take both a CRm and a 3-bit immediate.
 // NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
 // model patterns with sufficiently fine granularity
@@ -2180,11 +2236,11 @@ multiclass AddSub<bit isSub, string mnemonic, string alias,
   // Add/Subtract extended register
   let AddedComplexity = 1, hasSideEffects = 0 in {
   def Wrx : BaseAddSubEReg<isSub, 0, GPR32sp, GPR32sp,
-                           arith_extended_reg32<i32>, mnemonic, OpNode> {
+                           arith_extended_reg32_i32, mnemonic, OpNode> {
     let Inst{31} = 0;
   }
   def Xrx : BaseAddSubEReg<isSub, 0, GPR64sp, GPR64sp,
-                           arith_extended_reg32to64<i64>, mnemonic, OpNode> {
+                           arith_extended_reg32to64_i64, mnemonic, OpNode> {
     let Inst{31} = 1;
   }
   }
@@ -2254,11 +2310,11 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
   // Add/Subtract extended register
   let AddedComplexity = 1 in {
   def Wrx : BaseAddSubEReg<isSub, 1, GPR32, GPR32sp,
-                           arith_extended_reg32<i32>, mnemonic, OpNode> {
+                           arith_extended_reg32_i32, mnemonic, OpNode> {
     let Inst{31} = 0;
   }
   def Xrx : BaseAddSubEReg<isSub, 1, GPR64, GPR64sp,
-                           arith_extended_reg32<i64>, mnemonic, OpNode> {
+                           arith_extended_reg32_i64, mnemonic, OpNode> {
     let Inst{31} = 1;
   }
   }
@@ -2969,6 +3025,22 @@ def ro_Xindexed32 : ComplexPattern<i64, 4, "SelectAddrModeXRO<32>", []>;
 def ro_Xindexed64 : ComplexPattern<i64, 4, "SelectAddrModeXRO<64>", []>;
 def ro_Xindexed128 : ComplexPattern<i64, 4, "SelectAddrModeXRO<128>", []>;
 
+def gi_ro_Xindexed8 :
+    GIComplexOperandMatcher<s64, "selectAddrModeXRO<8>">,
+    GIComplexPatternEquiv<ro_Xindexed8>;
+def gi_ro_Xindexed16 :
+    GIComplexOperandMatcher<s64, "selectAddrModeXRO<16>">,
+    GIComplexPatternEquiv<ro_Xindexed16>;
+def gi_ro_Xindexed32 :
+    GIComplexOperandMatcher<s64, "selectAddrModeXRO<32>">,
+    GIComplexPatternEquiv<ro_Xindexed32>;
+def gi_ro_Xindexed64 :
+    GIComplexOperandMatcher<s64, "selectAddrModeXRO<64>">,
+    GIComplexPatternEquiv<ro_Xindexed64>;
+def gi_ro_Xindexed128 :
+    GIComplexOperandMatcher<s64, "selectAddrModeXRO<128>">,
+    GIComplexPatternEquiv<ro_Xindexed128>;
+
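The ro_Xindexed* patterns are four-operand ComplexPatterns (base, offset register, sign-extend flag, shift flag), and the gi_ro_* records route them to a templated GlobalISel selector. For orientation, a much-simplified DAG-side matcher for the register-offset mode — an illustrative sketch under those assumptions, not the in-tree SelectAddrModeXRO:

    // Fold (add base, (shl index, log2(Bytes))) into [Xn, Xm, lsl #s];
    // fall back to a plain register offset otherwise. Extend handling and
    // profitability checks are omitted.
    static bool matchXRO(llvm::SDValue N, unsigned Bytes, llvm::SDValue &Base,
                         llvm::SDValue &Offset, bool &DoShift) {
      using namespace llvm;
      if (N.getOpcode() != ISD::ADD)
        return false;
      SDValue LHS = N.getOperand(0), RHS = N.getOperand(1);
      if (RHS.getOpcode() == ISD::SHL)
        if (auto *C = dyn_cast<ConstantSDNode>(RHS.getOperand(1)))
          if ((1ull << C->getZExtValue()) == Bytes) {
            Base = LHS;
            Offset = RHS.getOperand(0);
            DoShift = true;
            return true;
          }
      Base = LHS;
      Offset = RHS;
      DoShift = false;
      return true;
    }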
 def ro_Windexed8 : ComplexPattern<i64, 4, "SelectAddrModeWRO<8>", []>;
 def ro_Windexed16 : ComplexPattern<i64, 4, "SelectAddrModeWRO<16>", []>;
 def ro_Windexed32 : ComplexPattern<i64, 4, "SelectAddrModeWRO<32>", []>;
@@ -4086,7 +4158,7 @@ multiclass MemTagStore<bits<2> opc1, string insn> {
 
 let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
 class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm>
-    : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>,
+    : I<(outs), (ins i32_imm0_65535:$imm), asm, "\t$imm", "", []>,
       Sched<[WriteSys]> {
   bits<16> imm;
   let Inst{31-24} = 0b11010100;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index 215e96a82d0e..5c35e5bcdd30 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -32,6 +32,7 @@
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/Support/Casting.h"
@@ -82,6 +83,10 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
     return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
   }
 
+  // Meta-instructions emit no code.
+  if (MI.isMetaInstruction())
+    return 0;
+
   // FIXME: We currently only handle pseudoinstructions that don't get expanded
   //        before the assembly printer.
   unsigned NumBytes = 0;
@@ -91,12 +96,6 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
     // Anything not explicitly designated otherwise is a normal 4-byte insn.
     NumBytes = 4;
     break;
-  case TargetOpcode::DBG_VALUE:
-  case TargetOpcode::EH_LABEL:
-  case TargetOpcode::IMPLICIT_DEF:
-  case TargetOpcode::KILL:
-    NumBytes = 0;
-    break;
   case TargetOpcode::STACKMAP:
     // The upper bound for a stackmap intrinsic is the full length of its shadow
     NumBytes = StackMapOpers(&MI).getNumPatchBytes();
@@ -416,7 +415,7 @@ unsigned AArch64InstrInfo::insertBranch(
 
 // Find the original register that VReg is copied from.
 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
-  while (TargetRegisterInfo::isVirtualRegister(VReg)) {
+  while (Register::isVirtualRegister(VReg)) {
    const MachineInstr *DefMI = MRI.getVRegDef(VReg);
     if (!DefMI->isFullCopy())
       return VReg;
@@ -431,7 +430,7 @@ static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
                                 unsigned *NewVReg = nullptr) {
   VReg = removeCopies(MRI, VReg);
-  if (!TargetRegisterInfo::isVirtualRegister(VReg))
+  if (!Register::isVirtualRegister(VReg))
     return 0;
 
   bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
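Switching getInstSizeInBytes to a blanket isMetaInstruction() test covers every size-zero meta opcode (CFI directives included) instead of the four cases that were enumerated. Anything that sums instruction sizes benefits, e.g. a block-size estimate like this (standard LLVM APIs, hypothetical helper name):

    // Conservative byte size of a basic block; meta-instructions now
    // uniformly contribute 0 through getInstSizeInBytes().
    static unsigned blockSizeInBytes(const llvm::MachineBasicBlock &MBB,
                                     const llvm::TargetInstrInfo &TII) {
      unsigned Size = 0;
      for (const llvm::MachineInstr &MI : MBB)
        Size += TII.getInstSizeInBytes(MI);
      return Size;
    }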
@@ -574,7 +573,7 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
     CC = AArch64CC::NE;
     break;
   }
-  unsigned SrcReg = Cond[2].getReg();
+  Register SrcReg = Cond[2].getReg();
   if (Is64Bit) {
     // cmp reg, #0 is actually subs xzr, reg, #0.
     MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
@@ -930,7 +929,7 @@ bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
 }
 
 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
-    const MachineInstr &MIa, const MachineInstr &MIb, AliasAnalysis *AA) const {
+    const MachineInstr &MIa, const MachineInstr &MIb) const {
   const TargetRegisterInfo *TRI = &getRegisterInfo();
   const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
   int64_t OffsetA = 0, OffsetB = 0;
@@ -1071,8 +1070,8 @@ static bool UpdateOperandRegClass(MachineInstr &Instr) {
     assert(MO.isReg() &&
            "Operand has register constraints without being a register!");
 
-    unsigned Reg = MO.getReg();
-    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+    Register Reg = MO.getReg();
+    if (Register::isPhysicalRegister(Reg)) {
       if (!OpRegCstraints->contains(Reg))
         return false;
     } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
@@ -1472,6 +1471,8 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     return false;
 
   MachineBasicBlock &MBB = *MI.getParent();
+  auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
+  auto TRI = Subtarget.getRegisterInfo();
   DebugLoc DL = MI.getDebugLoc();
 
   if (MI.getOpcode() == AArch64::CATCHRET) {
@@ -1497,21 +1498,32 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     return true;
   }
 
-  unsigned Reg = MI.getOperand(0).getReg();
+  Register Reg = MI.getOperand(0).getReg();
   const GlobalValue *GV =
       cast<GlobalValue>((*MI.memoperands_begin())->getValue());
   const TargetMachine &TM = MBB.getParent()->getTarget();
-  unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
+  unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
   const unsigned char MO_NC = AArch64II::MO_NC;
 
   if ((OpFlags & AArch64II::MO_GOT) != 0) {
     BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
         .addGlobalAddress(GV, 0, OpFlags);
-    BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
-        .addReg(Reg, RegState::Kill)
-        .addImm(0)
-        .addMemOperand(*MI.memoperands_begin());
+    if (Subtarget.isTargetILP32()) {
+      unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
+      BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
+          .addDef(Reg32, RegState::Dead)
+          .addUse(Reg, RegState::Kill)
+          .addImm(0)
+          .addMemOperand(*MI.memoperands_begin())
+          .addDef(Reg, RegState::Implicit);
+    } else {
+      BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+          .addReg(Reg, RegState::Kill)
+          .addImm(0)
+          .addMemOperand(*MI.memoperands_begin());
+    }
   } else if (TM.getCodeModel() == CodeModel::Large) {
+    assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
     BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
         .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
         .addImm(0);
@@ -1538,10 +1550,20 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
         .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
     unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
-    BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
-        .addReg(Reg, RegState::Kill)
-        .addGlobalAddress(GV, 0, LoFlags)
-        .addMemOperand(*MI.memoperands_begin());
+    if (Subtarget.isTargetILP32()) {
+      unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
+      BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
+          .addDef(Reg32, RegState::Dead)
+          .addUse(Reg, RegState::Kill)
+          .addGlobalAddress(GV, 0, LoFlags)
+          .addMemOperand(*MI.memoperands_begin())
+          .addDef(Reg, RegState::Implicit);
+    } else {
+      BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+          .addReg(Reg, RegState::Kill)
+          .addGlobalAddress(GV, 0, LoFlags)
+          .addMemOperand(*MI.memoperands_begin());
+    }
   }
 
   MBB.erase(MI);
@@ -1581,7 +1603,7 @@ bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
     break;
   case TargetOpcode::COPY: {
     // GPR32 copies will by lowered to ORRXrs
-    unsigned DstReg = MI.getOperand(0).getReg();
+    Register DstReg = MI.getOperand(0).getReg();
     return (AArch64::GPR32RegClass.contains(DstReg) ||
             AArch64::GPR64RegClass.contains(DstReg));
   }
@@ -1611,7 +1633,7 @@ bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
     break;
   case TargetOpcode::COPY: {
     // FPR64 copies will by lowered to ORR.16b
-    unsigned DstReg = MI.getOperand(0).getReg();
+    Register DstReg = MI.getOperand(0).getReg();
     return (AArch64::FPR64RegClass.contains(DstReg) ||
             AArch64::FPR128RegClass.contains(DstReg));
   }
@@ -1917,7 +1939,7 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
   // e.g., ldr x0, [x0]
   // This case will never occur with an FI base.
   if (MI.getOperand(1).isReg()) {
-    unsigned BaseReg = MI.getOperand(1).getReg();
+    Register BaseReg = MI.getOperand(1).getReg();
     const TargetRegisterInfo *TRI = &getRegisterInfo();
     if (MI.modifiesRegister(BaseReg, TRI))
       return false;
@@ -1928,6 +1950,17 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
   if (isLdStPairSuppressed(MI))
     return false;
 
+  // Do not pair any callee-save store/reload instructions in the
+  // prologue/epilogue if the CFI information encoded the operations as separate
+  // instructions, as that will cause the size of the actual prologue to mismatch
+  // with the prologue size recorded in the Windows CFI.
+  const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
+  bool NeedsWinCFI = MAI->usesWindowsCFI() &&
+                     MI.getMF()->getFunction().needsUnwindTableEntry();
+  if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
+                      MI.getFlag(MachineInstr::FrameDestroy)))
+    return false;
+
   // On some CPUs quad load/store pairs are slower than two single load/stores.
   if (Subtarget.isPaired128Slow()) {
     switch (MI.getOpcode()) {
@@ -2165,6 +2198,18 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
     MinOffset = -256;
     MaxOffset = 255;
     break;
+  case AArch64::LDR_PXI:
+  case AArch64::STR_PXI:
+    Scale = Width = 2;
+    MinOffset = -256;
+    MaxOffset = 255;
+    break;
+  case AArch64::LDR_ZXI:
+  case AArch64::STR_ZXI:
+    Scale = Width = 16;
+    MinOffset = -256;
+    MaxOffset = 255;
+    break;
   case AArch64::ST2GOffset:
   case AArch64::STZ2GOffset:
     Scale = 16;
@@ -2350,7 +2395,7 @@ static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
   if (!SubIdx)
     return MIB.addReg(Reg, State);
 
-  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+  if (Register::isPhysicalRegister(Reg))
     return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
   return MIB.addReg(Reg, State, SubIdx);
 }
@@ -2474,6 +2519,27 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     return;
   }
 
+  // Copy a Predicate register by ORRing with itself.
+  if (AArch64::PPRRegClass.contains(DestReg) &&
+      AArch64::PPRRegClass.contains(SrcReg)) {
+    assert(Subtarget.hasSVE() && "Unexpected SVE register.");
+    BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
+      .addReg(SrcReg) // Pg
+      .addReg(SrcReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  // Copy a Z register by ORRing with itself.
+  if (AArch64::ZPRRegClass.contains(DestReg) &&
+      AArch64::ZPRRegClass.contains(SrcReg)) {
+    assert(Subtarget.hasSVE() && "Unexpected SVE register.");
+    BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
+      .addReg(SrcReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
   if (AArch64::GPR64spRegClass.contains(DestReg) &&
       (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
     if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
@@ -2722,7 +2788,7 @@ static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
                                     MachineMemOperand *MMO) {
   unsigned SrcReg0 = SrcReg;
   unsigned SrcReg1 = SrcReg;
-  if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+  if (Register::isPhysicalRegister(SrcReg)) {
     SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
     SubIdx0 = 0;
     SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
@@ -2761,7 +2827,7 @@ void AArch64InstrInfo::storeRegToStackSlot(
   case 4:
     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
       Opc = AArch64::STRWui;
-      if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+      if (Register::isVirtualRegister(SrcReg))
         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
       else
         assert(SrcReg != AArch64::WSP);
@@ -2771,7 +2837,7 @@ void AArch64InstrInfo::storeRegToStackSlot(
   case 8:
     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
       Opc = AArch64::STRXui;
-      if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+      if (Register::isVirtualRegister(SrcReg))
         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
       else
         assert(SrcReg != AArch64::SP);
@@ -2852,7 +2918,7 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
   unsigned DestReg0 = DestReg;
   unsigned DestReg1 = DestReg;
   bool IsUndef = true;
-  if (TargetRegisterInfo::isPhysicalRegister(DestReg)) {
+  if (Register::isPhysicalRegister(DestReg)) {
     DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
     SubIdx0 = 0;
     DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
@@ -2892,7 +2958,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
   case 4:
     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
       Opc = AArch64::LDRWui;
-      if (TargetRegisterInfo::isVirtualRegister(DestReg))
+      if (Register::isVirtualRegister(DestReg))
         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
       else
         assert(DestReg != AArch64::WSP);
@@ -2902,7 +2968,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
   case 8:
     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
      Opc = AArch64::LDRXui;
-      if (TargetRegisterInfo::isVirtualRegister(DestReg))
+      if (Register::isVirtualRegister(DestReg))
        MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
       else
         assert(DestReg != AArch64::SP);
@@ -2972,21 +3038,39 @@ void AArch64InstrInfo::loadRegFromStackSlot(
 
   MI.addMemOperand(MMO);
 }
 
-void llvm::emitFrameOffset(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
-                           unsigned DestReg, unsigned SrcReg, int Offset,
-                           const TargetInstrInfo *TII,
-                           MachineInstr::MIFlag Flag, bool SetNZCV,
-                           bool NeedsWinCFI, bool *HasWinCFI) {
-  if (DestReg == SrcReg && Offset == 0)
-    return;
-
-  assert((DestReg != AArch64::SP || Offset % 16 == 0) &&
-         "SP increment/decrement not 16-byte aligned");
-
-  bool isSub = Offset < 0;
-  if (isSub)
-    Offset = -Offset;
+// Helper function to emit a frame offset adjustment from a given
+// pointer (SrcReg), stored into DestReg. This function is explicit
+// in that it requires the opcode.
+static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI,
+                               const DebugLoc &DL, unsigned DestReg,
+                               unsigned SrcReg, int64_t Offset, unsigned Opc,
+                               const TargetInstrInfo *TII,
+                               MachineInstr::MIFlag Flag, bool NeedsWinCFI,
+                               bool *HasWinCFI) {
+  int Sign = 1;
+  unsigned MaxEncoding, ShiftSize;
+  switch (Opc) {
+  case AArch64::ADDXri:
+  case AArch64::ADDSXri:
+  case AArch64::SUBXri:
+  case AArch64::SUBSXri:
+    MaxEncoding = 0xfff;
+    ShiftSize = 12;
+    break;
+  case AArch64::ADDVL_XXI:
+  case AArch64::ADDPL_XXI:
+    MaxEncoding = 31;
+    ShiftSize = 0;
+    if (Offset < 0) {
+      MaxEncoding = 32;
+      Sign = -1;
+      Offset = -Offset;
+    }
+    break;
+  default:
+    llvm_unreachable("Unsupported opcode");
+  }
 
   // FIXME: If the offset won't fit in 24-bits, compute the offset into a
   // scratch register.  If DestReg is a virtual register, use it as the
   // scratch register; otherwise, create a new virtual register (to be
@@ -2999,65 +3083,94 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
   // of code.
   //  assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
-  unsigned Opc;
-  if (SetNZCV)
-    Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri;
-  else
-    Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri;
-  const unsigned MaxEncoding = 0xfff;
-  const unsigned ShiftSize = 12;
   const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
-  while (((unsigned)Offset) >= (1 << ShiftSize)) {
-    unsigned ThisVal;
-    if (((unsigned)Offset) > MaxEncodableValue) {
-      ThisVal = MaxEncodableValue;
-    } else {
-      ThisVal = Offset & MaxEncodableValue;
+  do {
+    unsigned ThisVal = std::min<unsigned>(Offset, MaxEncodableValue);
+    unsigned LocalShiftSize = 0;
+    if (ThisVal > MaxEncoding) {
+      ThisVal = ThisVal >> ShiftSize;
+      LocalShiftSize = ShiftSize;
     }
     assert((ThisVal >> ShiftSize) <= MaxEncoding &&
            "Encoding cannot handle value that big");
-    BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
-        .addReg(SrcReg)
-        .addImm(ThisVal >> ShiftSize)
-        .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize))
-        .setMIFlag(Flag);
+    auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+                   .addReg(SrcReg)
+                   .addImm(Sign * (int)ThisVal);
+    if (ShiftSize)
+      MBI = MBI.addImm(
+          AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
+    MBI = MBI.setMIFlag(Flag);
 
-    if (NeedsWinCFI && SrcReg == AArch64::SP && DestReg == AArch64::SP) {
+    if (NeedsWinCFI) {
+      assert(Sign == 1 && "SEH directives should always have a positive sign");
+      int Imm = (int)(ThisVal << LocalShiftSize);
+      if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
+          (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
+        if (HasWinCFI)
+          *HasWinCFI = true;
+        if (Imm == 0)
+          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
+        else
+          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
+              .addImm(Imm)
+              .setMIFlag(Flag);
+        assert((Offset - Imm) == 0 && "Expected remaining offset to be zero to "
+                                      "emit a single SEH directive");
+      } else if (DestReg == AArch64::SP) {
+        if (HasWinCFI)
+          *HasWinCFI = true;
+        assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
+        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
+            .addImm(Imm)
+            .setMIFlag(Flag);
+      }
       if (HasWinCFI)
         *HasWinCFI = true;
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
-          .addImm(ThisVal)
-          .setMIFlag(Flag);
     }
 
     SrcReg = DestReg;
-    Offset -= ThisVal;
-    if (Offset == 0)
-      return;
-  }
-  BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
-      .addReg(SrcReg)
-      .addImm(Offset)
-      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
-      .setMIFlag(Flag);
+    Offset -= ThisVal << LocalShiftSize;
+  } while (Offset);
+}
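The do/while above merges the old split-then-final-add sequence: each trip emits one ADD/SUB whose 12-bit immediate is optionally shifted left by 12. A standalone model of just the arithmetic (plain C++, no LLVM types), tracing how 0x123456 splits into two instructions:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t Offset = 0x123456; // frame offset to materialize
      const unsigned MaxEncoding = 0xfff, ShiftSize = 12;
      const uint64_t MaxEncodable = (uint64_t)MaxEncoding << ShiftSize;
      do {
        uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodable);
        unsigned Shift = 0;
        if (ThisVal > MaxEncoding) {
          ThisVal >>= ShiftSize; // use the "LSL #12" immediate form
          Shift = ShiftSize;
        }
        std::printf("add xD, xS, #0x%llx, lsl #%u\n",
                    (unsigned long long)ThisVal, Shift);
        Offset -= ThisVal << Shift;
      } while (Offset);
      // Prints: add xD, xS, #0x123, lsl #12
      //         add xD, xS, #0x456, lsl #0
    }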
 
-  if (NeedsWinCFI) {
-    if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
-        (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
-      if (HasWinCFI)
-        *HasWinCFI = true;
-      if (Offset == 0)
-        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).
-            setMIFlag(Flag);
-      else
-        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)).
-            addImm(Offset).setMIFlag(Flag);
-    } else if (DestReg == AArch64::SP) {
-      if (HasWinCFI)
-        *HasWinCFI = true;
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)).
-          addImm(Offset).setMIFlag(Flag);
+void llvm::emitFrameOffset(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           StackOffset Offset, const TargetInstrInfo *TII,
+                           MachineInstr::MIFlag Flag, bool SetNZCV,
+                           bool NeedsWinCFI, bool *HasWinCFI) {
+  int64_t Bytes, NumPredicateVectors, NumDataVectors;
+  Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors);
+
+  // First emit non-scalable frame offsets, or a simple 'mov'.
+  if (Bytes || (!Offset && SrcReg != DestReg)) {
+    assert((DestReg != AArch64::SP || Bytes % 16 == 0) &&
+           "SP increment/decrement not 16-byte aligned");
+    unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
+    if (Bytes < 0) {
+      Bytes = -Bytes;
+      Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
+    }
+    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
+                       NeedsWinCFI, HasWinCFI);
+    SrcReg = DestReg;
+  }
+
+  assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
+         "SetNZCV not supported with SVE vectors");
+  assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
+         "WinCFI not supported with SVE vectors");
+
+  if (NumDataVectors) {
+    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
+                       AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr);
+    SrcReg = DestReg;
+  }
+
+  if (NumPredicateVectors) {
+    assert(DestReg != AArch64::SP && "Unaligned access to SP");
+    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
+                       AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr);
   }
 }
@@ -3079,15 +3192,13 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
   //   <rdar://problem/11522048>
   //
   if (MI.isFullCopy()) {
-    unsigned DstReg = MI.getOperand(0).getReg();
-    unsigned SrcReg = MI.getOperand(1).getReg();
-    if (SrcReg == AArch64::SP &&
-        TargetRegisterInfo::isVirtualRegister(DstReg)) {
+    Register DstReg = MI.getOperand(0).getReg();
+    Register SrcReg = MI.getOperand(1).getReg();
+    if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) {
       MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
       return nullptr;
     }
-    if (DstReg == AArch64::SP &&
-        TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+    if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) {
       MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
       return nullptr;
     }
@@ -3127,14 +3238,13 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
   MachineBasicBlock &MBB = *MI.getParent();
   const MachineOperand &DstMO = MI.getOperand(0);
   const MachineOperand &SrcMO = MI.getOperand(1);
-  unsigned DstReg = DstMO.getReg();
-  unsigned SrcReg = SrcMO.getReg();
+  Register DstReg = DstMO.getReg();
+  Register SrcReg = SrcMO.getReg();
   // This is slightly expensive to compute for physical regs since
   // getMinimalPhysRegClass is slow.
   auto getRegClass = [&](unsigned Reg) {
-    return TargetRegisterInfo::isVirtualRegister(Reg)
-               ? MRI.getRegClass(Reg)
-               : TRI.getMinimalPhysRegClass(Reg);
+    return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
                                             : TRI.getMinimalPhysRegClass(Reg);
   };
 
   if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
@@ -3159,8 +3269,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
   //
   //   STRXui %xzr, %stack.0
   //
-  if (IsSpill && DstMO.isUndef() &&
-      TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+  if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) {
     assert(SrcMO.getSubReg() == 0 &&
            "Unexpected subreg on physical register");
     const TargetRegisterClass *SpillRC;
@@ -3243,10 +3352,23 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
   return nullptr;
 }
 
-int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
+static bool isSVEScaledImmInstruction(unsigned Opcode) {
+  switch (Opcode) {
+  case AArch64::LDR_ZXI:
+  case AArch64::STR_ZXI:
+  case AArch64::LDR_PXI:
+  case AArch64::STR_PXI:
+    return true;
+  default:
+    return false;
+  }
+}
+
+int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
+                                    StackOffset &SOffset,
                                     bool *OutUseUnscaledOp,
                                     unsigned *OutUnscaledOp,
-                                    int *EmittableOffset) {
+                                    int64_t *EmittableOffset) {
   // Set output values in case of early exit.
   if (EmittableOffset)
     *EmittableOffset = 0;
@@ -3285,6 +3407,10 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
     llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
 
   // Construct the complete offset.
+  bool IsMulVL = isSVEScaledImmInstruction(MI.getOpcode());
+  int64_t Offset =
+      IsMulVL ? (SOffset.getScalableBytes()) : (SOffset.getBytes());
+
   const MachineOperand &ImmOpnd =
       MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
   Offset += ImmOpnd.getImm() * Scale;
@@ -3304,7 +3430,7 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
            "Cannot have remainder when using unscaled op");
     assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
 
-    int NewOffset = Offset / Scale;
+    int64_t NewOffset = Offset / Scale;
     if (MinOff <= NewOffset && NewOffset <= MaxOff)
       Offset = Remainder;
     else {
@@ -3319,27 +3445,33 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
   if (OutUnscaledOp && UnscaledOp)
     *OutUnscaledOp = *UnscaledOp;
 
+  if (IsMulVL)
+    SOffset = StackOffset(Offset, MVT::nxv1i8) +
+              StackOffset(SOffset.getBytes(), MVT::i8);
+  else
+    SOffset = StackOffset(Offset, MVT::i8) +
+              StackOffset(SOffset.getScalableBytes(), MVT::nxv1i8);
   return AArch64FrameOffsetCanUpdate |
-         (Offset == 0 ? AArch64FrameOffsetIsLegal : 0);
+         (SOffset ? 0 : AArch64FrameOffsetIsLegal);
 }
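StackOffset keeps the fixed and the scalable (vscale-scaled) parts of a frame offset separate, which is what the MVT::i8 / MVT::nxv1i8 constructor calls above split and re-pack. A toy model of the invariant (a sketch; the real class lives in the AArch64 backend and carries MVT plumbing this omits):

    #include <cstdint>

    // Fixed bytes plus bytes-times-vscale, never mixed into one number.
    class StackOffsetModel {
      int64_t Bytes = 0;          // the StackOffset(N, MVT::i8) part
      int64_t ScalableBytes = 0;  // the StackOffset(N, MVT::nxv1i8) part

    public:
      StackOffsetModel() = default;
      StackOffsetModel(int64_t B, int64_t SB) : Bytes(B), ScalableBytes(SB) {}

      StackOffsetModel operator+(const StackOffsetModel &O) const {
        return {Bytes + O.Bytes, ScalableBytes + O.ScalableBytes};
      }
      int64_t getBytes() const { return Bytes; }
      int64_t getScalableBytes() const { return ScalableBytes; }
      // The "(SOffset ? ...)" test above relies on this kind of emptiness check.
      explicit operator bool() const { return Bytes || ScalableBytes; }
    };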
 
 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
-                                    unsigned FrameReg, int &Offset,
+                                    unsigned FrameReg, StackOffset &Offset,
                                     const AArch64InstrInfo *TII) {
   unsigned Opcode = MI.getOpcode();
   unsigned ImmIdx = FrameRegIdx + 1;
 
   if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
-    Offset += MI.getOperand(ImmIdx).getImm();
+    Offset += StackOffset(MI.getOperand(ImmIdx).getImm(), MVT::i8);
     emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
                     MI.getOperand(0).getReg(), FrameReg, Offset, TII,
                     MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
     MI.eraseFromParent();
-    Offset = 0;
+    Offset = StackOffset();
     return true;
   }
 
-  int NewOffset;
+  int64_t NewOffset;
   unsigned UnscaledOp;
   bool UseUnscaledOp;
   int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
@@ -3352,7 +3484,7 @@ bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
     MI.setDesc(TII->get(UnscaledOp));
 
     MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
-    return Offset == 0;
+    return !Offset;
   }
 
   return false;
@@ -3428,13 +3560,19 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
   switch (Inst.getOpcode()) {
   default:
     break;
+  case AArch64::FADDHrr:
   case AArch64::FADDSrr:
   case AArch64::FADDDrr:
+  case AArch64::FADDv4f16:
+  case AArch64::FADDv8f16:
   case AArch64::FADDv2f32:
   case AArch64::FADDv2f64:
   case AArch64::FADDv4f32:
+  case AArch64::FSUBHrr:
   case AArch64::FSUBSrr:
   case AArch64::FSUBDrr:
+  case AArch64::FSUBv4f16:
+  case AArch64::FSUBv8f16:
   case AArch64::FSUBv2f32:
   case AArch64::FSUBv2f64:
   case AArch64::FSUBv4f32:
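The steady unsigned-to-Register churn through this file tracks the then-new llvm::Register wrapper, which moved the virtual/physical classification off TargetRegisterInfo. Condensed before/after, using only calls that appear in this diff:

    #include "llvm/CodeGen/MachineOperand.h"
    #include "llvm/CodeGen/Register.h"

    // Before: unsigned Reg = MO.getReg();
    //         if (TargetRegisterInfo::isVirtualRegister(Reg)) ...
    static bool isVRegOperand(const llvm::MachineOperand &MO) {
      if (!MO.isReg())
        return false;
      llvm::Register Reg = MO.getReg(); // converts implicitly from unsigned
      return llvm::Register::isVirtualRegister(Reg);
    }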
@@ -3459,7 +3597,7 @@ static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   MachineInstr *MI = nullptr;
 
-  if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+  if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
     MI = MRI.getUniqueVRegDef(MO.getReg());
 
   // And it needs to be in the trace (otherwise, it won't have a depth).
   if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
@@ -3544,86 +3682,48 @@ static bool getMaddPatterns(MachineInstr &Root,
     Opc = NewOpc;
   }
 
+  auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
+                      MachineCombinerPattern Pattern) {
+    if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
+      Patterns.push_back(Pattern);
+      Found = true;
+    }
+  };
+
+  typedef MachineCombinerPattern MCP;
+
   switch (Opc) {
   default:
     break;
   case AArch64::ADDWrr:
     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
            "ADDWrr does not have register operands");
-    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
-                          AArch64::WZR)) {
-      Patterns.push_back(MachineCombinerPattern::MULADDW_OP1);
-      Found = true;
-    }
-    if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
-                          AArch64::WZR)) {
-      Patterns.push_back(MachineCombinerPattern::MULADDW_OP2);
-      Found = true;
-    }
+    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
+    setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
     break;
   case AArch64::ADDXrr:
-    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
-                          AArch64::XZR)) {
-      Patterns.push_back(MachineCombinerPattern::MULADDX_OP1);
-      Found = true;
-    }
-    if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
-                          AArch64::XZR)) {
-      Patterns.push_back(MachineCombinerPattern::MULADDX_OP2);
-      Found = true;
-    }
+    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
+    setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
     break;
   case AArch64::SUBWrr:
-    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
-                          AArch64::WZR)) {
-      Patterns.push_back(MachineCombinerPattern::MULSUBW_OP1);
-      Found = true;
-    }
-    if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
-                          AArch64::WZR)) {
-      Patterns.push_back(MachineCombinerPattern::MULSUBW_OP2);
-      Found = true;
-    }
+    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
+    setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
    break;
   case AArch64::SUBXrr:
-    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
-                          AArch64::XZR)) {
-      Patterns.push_back(MachineCombinerPattern::MULSUBX_OP1);
-      Found = true;
-    }
-    if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
-                          AArch64::XZR)) {
-      Patterns.push_back(MachineCombinerPattern::MULSUBX_OP2);
-      Found = true;
-    }
+    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
+    setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
     break;
   case AArch64::ADDWri:
-    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
-                          AArch64::WZR)) {
-      Patterns.push_back(MachineCombinerPattern::MULADDWI_OP1);
-      Found = true;
-    }
+    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
     break;
   case AArch64::ADDXri:
-    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
-                          AArch64::XZR)) {
-      Patterns.push_back(MachineCombinerPattern::MULADDXI_OP1);
-      Found = true;
-    }
+    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
     break;
   case AArch64::SUBWri:
-    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
-                          AArch64::WZR)) {
-      Patterns.push_back(MachineCombinerPattern::MULSUBWI_OP1);
-      Found = true;
-    }
+    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
     break;
   case AArch64::SUBXri:
-    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
-                          AArch64::XZR)) {
-      Patterns.push_back(MachineCombinerPattern::MULSUBXI_OP1);
-      Found = true;
-    }
+    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
     break;
   }
   return Found;
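One subtlety of the lambda rewrites here and in the next hunk: in getFMAPatterns, "Found |= Match(A, n, P1) || Match(B, n, P2)" preserves the old if/else-if precedence, because || short-circuits and the second Match (with its Patterns.push_back) never runs once the first succeeds. A distilled model of that shape (generic C++, not the LLVM code itself):

    #include <vector>

    static bool matches(int Opcode) { return Opcode == 1; } // stand-in test

    static bool collect(std::vector<int> &Patterns) {
      bool Found = false;
      auto Match = [&](int Opcode, int Pattern) -> bool {
        if (matches(Opcode)) {
          Patterns.push_back(Pattern);
          return true;
        }
        return false;
      };
      // Records pattern 10, then short-circuits: pattern 20 is never tried.
      Found |= Match(1, 10) || Match(1, 20);
      return Found;
    }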
@@ -3640,204 +3740,135 @@ static bool getFMAPatterns(MachineInstr &Root,
 
   MachineBasicBlock &MBB = *Root.getParent();
   bool Found = false;
 
+  auto Match = [&](int Opcode, int Operand,
+                   MachineCombinerPattern Pattern) -> bool {
+    if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
+      Patterns.push_back(Pattern);
+      return true;
+    }
+    return false;
+  };
+
+  typedef MachineCombinerPattern MCP;
+
   switch (Root.getOpcode()) {
   default:
     assert(false && "Unsupported FP instruction in combiner\n");
     break;
+  case AArch64::FADDHrr:
+    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
+           "FADDHrr does not have register operands");
+
+    Found  = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
+    Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
+    break;
   case AArch64::FADDSrr:
     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
-           "FADDWrr does not have register operands");
-    if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
-      Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1);
-      Found = true;
-    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
-                                  AArch64::FMULv1i32_indexed)) {
-      Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1);
-      Found = true;
-    }
-    if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
-      Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2);
-      Found = true;
-    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
-                                  AArch64::FMULv1i32_indexed)) {
-      Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2);
-      Found = true;
-    }
+           "FADDSrr does not have register operands");
+
+    Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
+             Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
+
+    Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
+             Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
     break;
   case AArch64::FADDDrr:
-    if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
-      Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1);
-      Found = true;
-    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
-                                  AArch64::FMULv1i64_indexed)) {
-      Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1);
-      Found = true;
-    }
-    if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
-      Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2);
-      Found = true;
-    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
-                                  AArch64::FMULv1i64_indexed)) {
-      Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2);
-      Found = true;
-    }
+    Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
+             Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
+
+    Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
+             Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
+    break;
+  case AArch64::FADDv4f16:
+    Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
+             Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
+
+    Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
+             Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
+    break;
+  case AArch64::FADDv8f16:
+    Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
+             Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
+
+    Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
+             Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
    break;
   case AArch64::FADDv2f32:
-    if (canCombineWithFMUL(MBB, Root.getOperand(1),
-                           AArch64::FMULv2i32_indexed)) {
-      Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1);
-      Found = true;
-    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
-                                  AArch64::FMULv2f32)) {
-      Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1);
-      Found = true;
-    }
-    if (canCombineWithFMUL(MBB, Root.getOperand(2),
-                           AArch64::FMULv2i32_indexed)) {
-      Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2);
-      Found = true;
-    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
-                                  AArch64::FMULv2f32)) {
-      Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2);
-      Found = true;
-    }
+    Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
+             Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
+
+    Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
+             Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
     break;
   case AArch64::FADDv2f64:
-    if (canCombineWithFMUL(MBB, Root.getOperand(1),
-                           AArch64::FMULv2i64_indexed)) {
-      Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1);
-      Found = true;
-    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
-                                  AArch64::FMULv2f64)) {
-      Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1);
-      Found = true;
-    }
-    if (canCombineWithFMUL(MBB, Root.getOperand(2),
-                           AArch64::FMULv2i64_indexed)) {
-      Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2);
-      Found = true;
-    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
-                                  AArch64::FMULv2f64)) {
-      Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2);
-      Found = true;
-    }
+    Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
+             Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
+
+    Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
+             Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
     break;
   case AArch64::FADDv4f32:
-    if (canCombineWithFMUL(MBB, Root.getOperand(1),
-                           AArch64::FMULv4i32_indexed)) {
-      Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1);
-      Found = true;
-    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
-                                  AArch64::FMULv4f32)) {
-      Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1);
-      Found = true;
-    }
-    if (canCombineWithFMUL(MBB, Root.getOperand(2),
-                           AArch64::FMULv4i32_indexed)) {
-      Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2);
-      Found = true;
-    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
-                                  AArch64::FMULv4f32)) {
-      Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2);
-      Found = true;
-    }
-    break;
+    Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
+             Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
+
+    Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
+             Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
+    break;
+  case AArch64::FSUBHrr:
+    Found  = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
+    Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
+    Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
+    break;
   case AArch64::FSUBSrr:
-    if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
-      Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1);
-      Found = true;
-    }
-    if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
-      Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2);
-      Found = true;
-    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
-                                  AArch64::FMULv1i32_indexed)) {
-      Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2);
-      Found = true;
-    }
-    if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) {
-      Patterns.push_back(MachineCombinerPattern::FNMULSUBS_OP1);
-      Found = true;
-    }
+    Found  = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
+
+    Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
+             Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
+
+    Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
     break;
   case AArch64::FSUBDrr:
-    if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
-      Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1);
-      Found = true;
-    }
-    if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
-      Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2);
-      Found = true;
-    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
-                                  AArch64::FMULv1i64_indexed)) {
-      Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
-      Found = true;
-    }
-    if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) {
-      Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1);
-      Found = true;
-    }
+    Found  = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
+
+    Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
+             Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
+
+    Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
+    break;
+  case AArch64::FSUBv4f16:
+    Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
+             Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
+
+    Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
+             Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
+    break;
+  case AArch64::FSUBv8f16:
+    Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
+             Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
+
+    Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
+             Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
     break;
   case AArch64::FSUBv2f32:
-    if (canCombineWithFMUL(MBB, Root.getOperand(2),
-                           AArch64::FMULv2i32_indexed)) {
-      Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2);
-      Found = true;
-    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
-                                  AArch64::FMULv2f32)) {
-      Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2);
-      Found = true;
-    }
-    if (canCombineWithFMUL(MBB, Root.getOperand(1),
-                           AArch64::FMULv2i32_indexed)) {
-      Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP1);
-      Found = true;
-    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
-                                  AArch64::FMULv2f32)) {
-      Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP1);
-      Found = true;
-    }
+    Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
+             Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
+
+    Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
+             Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
     break;
   case AArch64::FSUBv2f64:
-    if (canCombineWithFMUL(MBB, Root.getOperand(2),
-                           AArch64::FMULv2i64_indexed)) {
-      Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2);
-      Found = true;
-    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
-                                  AArch64::FMULv2f64)) {
-      Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2);
-      Found = true;
-    }
-    if (canCombineWithFMUL(MBB, Root.getOperand(1),
-                           AArch64::FMULv2i64_indexed)) {
-      Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP1);
-      Found = true;
-    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
-                                  AArch64::FMULv2f64)) {
-      Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP1);
-      Found = true;
-    }
+    Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
+             Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
+
+    Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
+             Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
     break;
   case AArch64::FSUBv4f32:
-    if (canCombineWithFMUL(MBB, Root.getOperand(2),
-                           AArch64::FMULv4i32_indexed)) {
-      Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2);
-      Found = true;
-    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
-                                  AArch64::FMULv4f32)) {
-      Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2);
-      Found = true;
-    }
-    if (canCombineWithFMUL(MBB, Root.getOperand(1),
-                           AArch64::FMULv4i32_indexed)) {
-      Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP1);
-      Found = true;
-    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
-                                  AArch64::FMULv4f32)) {
-      Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP1);
-      Found = true;
-    }
+    Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
+             Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
+
+    Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
+             Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
     break;
   }
   return Found;
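Every pattern collected above presumes the multiply is actually safe to fuse into its user. Modeled on the canCombine logic earlier in this file (helper name and trimming are mine, not the in-tree function), the guard boils down to:

    #include "llvm/CodeGen/MachineRegisterInfo.h"

    // The operand must be a virtual register with a unique def in the same
    // block, and that def must be the expected multiply opcode; otherwise
    // fusing would be unsound or unprofitable.
    static bool isFusableMul(const llvm::MachineBasicBlock &MBB,
                             const llvm::MachineOperand &MO, unsigned MulOpc,
                             const llvm::MachineRegisterInfo &MRI) {
      if (!MO.isReg() || !llvm::Register::isVirtualRegister(MO.getReg()))
        return false;
      const llvm::MachineInstr *Def = MRI.getUniqueVRegDef(MO.getReg());
      return Def && Def->getParent() == &MBB && Def->getOpcode() == MulOpc;
    }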
MachineCombinerPattern::FMLSv8f16_OP2: case MachineCombinerPattern::FMLSv2f32_OP2: case MachineCombinerPattern::FMLSv2f64_OP2: case MachineCombinerPattern::FMLSv4i32_indexed_OP2: @@ -3933,15 +3985,15 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC, FMAInstKind kind = FMAInstKind::Default, - const unsigned *ReplacedAddend = nullptr) { + const Register *ReplacedAddend = nullptr) { assert(IdxMulOpd == 1 || IdxMulOpd == 2); unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1; MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); - unsigned ResultReg = Root.getOperand(0).getReg(); - unsigned SrcReg0 = MUL->getOperand(1).getReg(); + Register ResultReg = Root.getOperand(0).getReg(); + Register SrcReg0 = MUL->getOperand(1).getReg(); bool Src0IsKill = MUL->getOperand(1).isKill(); - unsigned SrcReg1 = MUL->getOperand(2).getReg(); + Register SrcReg1 = MUL->getOperand(2).getReg(); bool Src1IsKill = MUL->getOperand(2).isKill(); unsigned SrcReg2; @@ -3955,13 +4007,13 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, Src2IsKill = Root.getOperand(IdxOtherOpd).isKill(); } - if (TargetRegisterInfo::isVirtualRegister(ResultReg)) + if (Register::isVirtualRegister(ResultReg)) MRI.constrainRegClass(ResultReg, RC); - if (TargetRegisterInfo::isVirtualRegister(SrcReg0)) + if (Register::isVirtualRegister(SrcReg0)) MRI.constrainRegClass(SrcReg0, RC); - if (TargetRegisterInfo::isVirtualRegister(SrcReg1)) + if (Register::isVirtualRegister(SrcReg1)) MRI.constrainRegClass(SrcReg1, RC); - if (TargetRegisterInfo::isVirtualRegister(SrcReg2)) + if (Register::isVirtualRegister(SrcReg2)) MRI.constrainRegClass(SrcReg2, RC); MachineInstrBuilder MIB; @@ -4015,19 +4067,19 @@ static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, assert(IdxMulOpd == 1 || IdxMulOpd == 2); MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); - unsigned ResultReg = Root.getOperand(0).getReg(); - unsigned SrcReg0 = MUL->getOperand(1).getReg(); + Register ResultReg = Root.getOperand(0).getReg(); + Register SrcReg0 = MUL->getOperand(1).getReg(); bool Src0IsKill = MUL->getOperand(1).isKill(); - unsigned SrcReg1 = MUL->getOperand(2).getReg(); + Register SrcReg1 = MUL->getOperand(2).getReg(); bool Src1IsKill = MUL->getOperand(2).isKill(); - if (TargetRegisterInfo::isVirtualRegister(ResultReg)) + if (Register::isVirtualRegister(ResultReg)) MRI.constrainRegClass(ResultReg, RC); - if (TargetRegisterInfo::isVirtualRegister(SrcReg0)) + if (Register::isVirtualRegister(SrcReg0)) MRI.constrainRegClass(SrcReg0, RC); - if (TargetRegisterInfo::isVirtualRegister(SrcReg1)) + if (Register::isVirtualRegister(SrcReg1)) MRI.constrainRegClass(SrcReg1, RC); - if (TargetRegisterInfo::isVirtualRegister(VR)) + if (Register::isVirtualRegister(VR)) MRI.constrainRegClass(VR, RC); MachineInstrBuilder MIB = @@ -4116,7 +4168,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - unsigned NewVR = MRI.createVirtualRegister(OrrRC); + Register NewVR = MRI.createVirtualRegister(OrrRC); uint64_t Imm = Root.getOperand(2).getImm(); if (Root.getOperand(3).isImm()) { @@ -4158,7 +4210,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - unsigned NewVR = MRI.createVirtualRegister(SubRC); + Register NewVR = MRI.createVirtualRegister(SubRC); // SUB NewVR, 0, C 
    MachineInstrBuilder MIB1 =
        BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
@@ -4208,7 +4260,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
      Opc = AArch64::MADDXrrr;
      RC = &AArch64::GPR64RegClass;
    }
-    unsigned NewVR = MRI.createVirtualRegister(OrrRC);
+    Register NewVR = MRI.createVirtualRegister(OrrRC);
    uint64_t Imm = Root.getOperand(2).getImm();
    if (Root.getOperand(3).isImm()) {
      unsigned Val = Root.getOperand(3).getImm();
@@ -4228,34 +4280,35 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
    break;
  }
  // Floating Point Support
+  case MachineCombinerPattern::FMULADDH_OP1:
+    Opc = AArch64::FMADDHrrr;
+    RC = &AArch64::FPR16RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
  case MachineCombinerPattern::FMULADDS_OP1:
+    Opc = AArch64::FMADDSrrr;
+    RC = &AArch64::FPR32RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
  case MachineCombinerPattern::FMULADDD_OP1:
-    // MUL I=A,B,0
-    // ADD R,I,C
-    // ==> MADD R,A,B,C
-    // --- Create(MADD);
-    if (Pattern == MachineCombinerPattern::FMULADDS_OP1) {
-      Opc = AArch64::FMADDSrrr;
-      RC = &AArch64::FPR32RegClass;
-    } else {
-      Opc = AArch64::FMADDDrrr;
-      RC = &AArch64::FPR64RegClass;
-    }
+    Opc = AArch64::FMADDDrrr;
+    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
+
+  case MachineCombinerPattern::FMULADDH_OP2:
+    Opc = AArch64::FMADDHrrr;
+    RC = &AArch64::FPR16RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
  case MachineCombinerPattern::FMULADDS_OP2:
+    Opc = AArch64::FMADDSrrr;
+    RC = &AArch64::FPR32RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
  case MachineCombinerPattern::FMULADDD_OP2:
-    // FMUL I=A,B,0
-    // FADD R,C,I
-    // ==> FMADD R,A,B,C
-    // --- Create(FMADD);
-    if (Pattern == MachineCombinerPattern::FMULADDS_OP2) {
-      Opc = AArch64::FMADDSrrr;
-      RC = &AArch64::FPR32RegClass;
-    } else {
-      Opc = AArch64::FMADDDrrr;
-      RC = &AArch64::FPR64RegClass;
-    }
+    Opc = AArch64::FMADDDrrr;
+    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
 
@@ -4285,6 +4338,31 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
                           FMAInstKind::Indexed);
    break;
+  case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
+    RC = &AArch64::FPR64RegClass;
+    Opc = AArch64::FMLAv4i16_indexed;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                           FMAInstKind::Indexed);
+    break;
+  case MachineCombinerPattern::FMLAv4f16_OP1:
+    RC = &AArch64::FPR64RegClass;
+    Opc = AArch64::FMLAv4f16;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                           FMAInstKind::Accumulator);
+    break;
+  case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
+    RC = &AArch64::FPR64RegClass;
+    Opc = AArch64::FMLAv4i16_indexed;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                           FMAInstKind::Indexed);
+    break;
+  case MachineCombinerPattern::FMLAv4f16_OP2:
+    RC = &AArch64::FPR64RegClass;
+    Opc = AArch64::FMLAv4f16;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                           FMAInstKind::Accumulator);
+    break;
+
  case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
  case MachineCombinerPattern::FMLAv2f32_OP1:
    RC = &AArch64::FPR64RegClass;
@@ -4312,6 +4390,31 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
    }
    break;
+  case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
+    RC = &AArch64::FPR128RegClass;
+    Opc = AArch64::FMLAv8i16_indexed;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                           FMAInstKind::Indexed);
+    break;
+  case MachineCombinerPattern::FMLAv8f16_OP1:
+    RC = &AArch64::FPR128RegClass;
+    Opc = AArch64::FMLAv8f16;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                           FMAInstKind::Accumulator);
+    break;
+  case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
+    RC = &AArch64::FPR128RegClass;
+    Opc = AArch64::FMLAv8i16_indexed;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                           FMAInstKind::Indexed);
+    break;
+  case MachineCombinerPattern::FMLAv8f16_OP2:
+    RC = &AArch64::FPR128RegClass;
+    Opc = AArch64::FMLAv8f16;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                           FMAInstKind::Accumulator);
+    break;
+
  case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
  case MachineCombinerPattern::FMLAv2f64_OP1:
    RC = &AArch64::FPR128RegClass;
@@ -4367,56 +4470,53 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
    }
    break;
+  case MachineCombinerPattern::FMULSUBH_OP1:
+    Opc = AArch64::FNMSUBHrrr;
+    RC = &AArch64::FPR16RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
  case MachineCombinerPattern::FMULSUBS_OP1:
-  case MachineCombinerPattern::FMULSUBD_OP1: {
-    // FMUL I=A,B,0
-    // FSUB R,I,C
-    // ==> FNMSUB R,A,B,C // = -C + A*B
-    // --- Create(FNMSUB);
-    if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) {
-      Opc = AArch64::FNMSUBSrrr;
-      RC = &AArch64::FPR32RegClass;
-    } else {
-      Opc = AArch64::FNMSUBDrrr;
-      RC = &AArch64::FPR64RegClass;
-    }
+    Opc = AArch64::FNMSUBSrrr;
+    RC = &AArch64::FPR32RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  case MachineCombinerPattern::FMULSUBD_OP1:
+    Opc = AArch64::FNMSUBDrrr;
+    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
-  }
+  case MachineCombinerPattern::FNMULSUBH_OP1:
+    Opc = AArch64::FNMADDHrrr;
+    RC = &AArch64::FPR16RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
  case MachineCombinerPattern::FNMULSUBS_OP1:
-  case MachineCombinerPattern::FNMULSUBD_OP1: {
-    // FNMUL I=A,B,0
-    // FSUB R,I,C
-    // ==> FNMADD R,A,B,C // = -A*B - C
-    // --- Create(FNMADD);
-    if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) {
-      Opc = AArch64::FNMADDSrrr;
-      RC = &AArch64::FPR32RegClass;
-    } else {
-      Opc = AArch64::FNMADDDrrr;
-      RC = &AArch64::FPR64RegClass;
-    }
+    Opc = AArch64::FNMADDSrrr;
+    RC = &AArch64::FPR32RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  case MachineCombinerPattern::FNMULSUBD_OP1:
+    Opc = AArch64::FNMADDDrrr;
+    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
-  }
+  case MachineCombinerPattern::FMULSUBH_OP2:
+    Opc = AArch64::FMSUBHrrr;
+    RC = &AArch64::FPR16RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
  case MachineCombinerPattern::FMULSUBS_OP2:
-  case MachineCombinerPattern::FMULSUBD_OP2: {
-    // FMUL I=A,B,0
-    // FSUB R,C,I
-    // ==> FMSUB R,A,B,C (computes C - A*B)
-    // --- Create(FMSUB);
-    if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) {
-      Opc = AArch64::FMSUBSrrr;
-      RC = &AArch64::FPR32RegClass;
-    } else {
-      Opc = AArch64::FMSUBDrrr;
-      RC = &AArch64::FPR64RegClass;
-    }
+    Opc = AArch64::FMSUBSrrr;
+    RC = &AArch64::FPR32RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+  case MachineCombinerPattern::FMULSUBD_OP2:
+    Opc = AArch64::FMSUBDrrr;
+    RC = &AArch64::FPR64RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
-  }
 
  case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
    Opc = AArch64::FMLSv1i32_indexed;
@@ -4432,6 +4532,39 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
                           FMAInstKind::Indexed);
    break;
+  case MachineCombinerPattern::FMLSv4f16_OP1:
+  case MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
+    RC = &AArch64::FPR64RegClass;
+    Register NewVR = MRI.createVirtualRegister(RC);
+    MachineInstrBuilder MIB1 =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR)
+            .add(Root.getOperand(2));
+    InsInstrs.push_back(MIB1);
+    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+    if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) {
+      Opc = AArch64::FMLAv4f16;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Accumulator, &NewVR);
+    } else {
+      Opc = AArch64::FMLAv4i16_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Indexed, &NewVR);
+    }
+    break;
+  }
+  case MachineCombinerPattern::FMLSv4f16_OP2:
+    RC = &AArch64::FPR64RegClass;
+    Opc = AArch64::FMLSv4f16;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                           FMAInstKind::Accumulator);
+    break;
+  case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
+    RC = &AArch64::FPR64RegClass;
+    Opc = AArch64::FMLSv4i16_indexed;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                           FMAInstKind::Indexed);
+    break;
+
  case MachineCombinerPattern::FMLSv2f32_OP2:
  case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
    RC = &AArch64::FPR64RegClass;
@@ -4446,6 +4579,39 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
    }
    break;
+  case MachineCombinerPattern::FMLSv8f16_OP1:
+  case MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
+    RC = &AArch64::FPR128RegClass;
+    Register NewVR = MRI.createVirtualRegister(RC);
+    MachineInstrBuilder MIB1 =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR)
+            .add(Root.getOperand(2));
+    InsInstrs.push_back(MIB1);
+    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+    if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) {
+      Opc = AArch64::FMLAv8f16;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Accumulator, &NewVR);
+    } else {
+      Opc = AArch64::FMLAv8i16_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Indexed, &NewVR);
+    }
+    break;
+  }
+  case MachineCombinerPattern::FMLSv8f16_OP2:
+    RC = &AArch64::FPR128RegClass;
+    Opc = AArch64::FMLSv8f16;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                           FMAInstKind::Accumulator);
+    break;
+  case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
+    RC = &AArch64::FPR128RegClass;
+    Opc = AArch64::FMLSv8i16_indexed;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                           FMAInstKind::Indexed);
+    break;
+
  case MachineCombinerPattern::FMLSv2f64_OP2:
  case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
    RC = &AArch64::FPR128RegClass;
@@ -4476,7 +4642,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
  case MachineCombinerPattern::FMLSv2f32_OP1:
  case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
    RC = &AArch64::FPR64RegClass;
-    unsigned NewVR = MRI.createVirtualRegister(RC);
+    Register NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR)
            .add(Root.getOperand(2));
@@ -4496,7 +4662,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
  case MachineCombinerPattern::FMLSv4f32_OP1:
  case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
    RC = &AArch64::FPR128RegClass;
-    unsigned NewVR = MRI.createVirtualRegister(RC);
+    Register NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR)
            .add(Root.getOperand(2));
@@ -4516,7 +4682,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
  case MachineCombinerPattern::FMLSv2f64_OP1:
  case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
    RC = &AArch64::FPR128RegClass;
-    unsigned NewVR = MRI.createVirtualRegister(RC);
+    Register NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR)
            .add(Root.getOperand(2));
@@ -4617,15 +4783,15 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
-  unsigned VReg = MI.getOperand(0).getReg();
-  if (!TargetRegisterInfo::isVirtualRegister(VReg))
+  Register VReg = MI.getOperand(0).getReg();
+  if (!Register::isVirtualRegister(VReg))
    return false;
 
  MachineInstr *DefMI = MRI->getVRegDef(VReg);
 
  // Look through COPY instructions to find definition.
  while (DefMI->isCopy()) {
-    unsigned CopyVReg = DefMI->getOperand(1).getReg();
+    Register CopyVReg = DefMI->getOperand(1).getReg();
    if (!MRI->hasOneNonDBGUse(CopyVReg))
      return false;
    if (!MRI->hasOneDef(CopyVReg))
@@ -4653,8 +4819,8 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
      return false;
 
    MachineOperand &MO = DefMI->getOperand(1);
-    unsigned NewReg = MO.getReg();
-    if (!TargetRegisterInfo::isVirtualRegister(NewReg))
+    Register NewReg = MO.getReg();
+    if (!Register::isVirtualRegister(NewReg))
      return false;
 
    assert(!MRI->def_empty(NewReg) && "Register must be defined.");
@@ -4737,9 +4903,13 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
  static const std::pair<unsigned, const char *> TargetFlags[] = {
      {MO_COFFSTUB, "aarch64-coffstub"},
-      {MO_GOT, "aarch64-got"},   {MO_NC, "aarch64-nc"},
-      {MO_S, "aarch64-s"},       {MO_TLS, "aarch64-tls"},
-      {MO_DLLIMPORT, "aarch64-dllimport"}};
+      {MO_GOT, "aarch64-got"},
+      {MO_NC, "aarch64-nc"},
+      {MO_S, "aarch64-s"},
+      {MO_TLS, "aarch64-tls"},
+      {MO_DLLIMPORT, "aarch64-dllimport"},
+      {MO_PREL, "aarch64-prel"},
+      {MO_TAGGED, "aarch64-tagged"}};
  return makeArrayRef(TargetFlags);
}
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 7be4daba7dc4..1688045e4fb8 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -15,6 +15,7 @@
 #include "AArch64.h"
 #include "AArch64RegisterInfo.h"
+#include "AArch64StackOffset.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/CodeGen/MachineCombinerPattern.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
@@ -55,8 +56,7 @@ public:
   bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
-                                       const MachineInstr &MIb,
-                                       AliasAnalysis *AA = nullptr) const override;
+                                       const MachineInstr &MIb) const override;
 
   unsigned isLoadFromStackSlot(const MachineInstr &MI,
                                int &FrameIndex) const override;
@@ -299,7 +299,7 @@ private:
/// if necessary, to be replaced by the scavenger at the end of PEI.
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                     const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
-                     int Offset, const TargetInstrInfo *TII,
+                     StackOffset Offset, const TargetInstrInfo *TII,
                     MachineInstr::MIFlag = MachineInstr::NoFlags,
                     bool SetNZCV = false, bool NeedsWinCFI = false,
                     bool *HasWinCFI = nullptr);
@@ -308,7 +308,7 @@ void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
/// FP. Return false if the offset could not be handled directly in MI, and
/// return the left-over portion by reference.
bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
-                              unsigned FrameReg, int &Offset,
+                              unsigned FrameReg, StackOffset &Offset,
                              const AArch64InstrInfo *TII);

/// Use to report the frame offset status in isAArch64FrameOffsetLegal.
@@ -332,10 +332,10 @@ enum AArch64FrameOffsetStatus {
/// If set, @p EmittableOffset contains the amount that can be set in @p MI
/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that
/// is a legal offset.
-int isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
+int isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &Offset,
                              bool *OutUseUnscaledOp = nullptr,
                              unsigned *OutUnscaledOp = nullptr,
-                              int *EmittableOffset = nullptr);
+                              int64_t *EmittableOffset = nullptr);

static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; }
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index eed53f36d574..1981bd5d3bf0 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -62,6 +62,9 @@ def HasAM : Predicate<"Subtarget->hasAM()">,
 def HasSEL2 : Predicate<"Subtarget->hasSEL2()">,
                        AssemblerPredicate<"FeatureSEL2", "sel2">;
 
+def HasPMU : Predicate<"Subtarget->hasPMU()">,
+                       AssemblerPredicate<"FeaturePMU", "pmu">;
+
 def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">,
                        AssemblerPredicate<"FeatureTLB_RMI", "tlb-rmi">;
 
@@ -116,7 +119,7 @@ def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">,
 def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">,
                 AssemblerPredicate<"FeatureSVE2SHA3", "sve2-sha3">;
 def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">,
-                 AssemblerPredicate<"FeatureSVE2BitPerm", "bitperm">;
+                 AssemblerPredicate<"FeatureSVE2BitPerm", "sve2-bitperm">;
 def HasRCPC : Predicate<"Subtarget->hasRCPC()">,
                 AssemblerPredicate<"FeatureRCPC", "rcpc">;
 def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">,
@@ -133,6 +136,12 @@ def HasBTI : Predicate<"Subtarget->hasBTI()">,
             AssemblerPredicate<"FeatureBranchTargetId", "bti">;
 def HasMTE : Predicate<"Subtarget->hasMTE()">,
             AssemblerPredicate<"FeatureMTE", "mte">;
+def HasTME : Predicate<"Subtarget->hasTME()">,
+    AssemblerPredicate<"FeatureTME", "tme">;
+def HasETE : Predicate<"Subtarget->hasETE()">,
+    AssemblerPredicate<"FeatureETE", "ete">;
+def HasTRBE : Predicate<"Subtarget->hasTRBE()">,
+    AssemblerPredicate<"FeatureTRBE", "trbe">;
 def IsLE : Predicate<"Subtarget->isLittleEndian()">;
 def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
 def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
@@ -415,6 +424,14 @@ def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 def AArch64st2g : SDNode<"AArch64ISD::ST2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 def AArch64stz2g : SDNode<"AArch64ISD::STZ2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
+def SDT_AArch64unpk : SDTypeProfile<1, 1, [
+    SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>
+]>;
+def AArch64sunpkhi : SDNode<"AArch64ISD::SUNPKHI", SDT_AArch64unpk>;
+def AArch64sunpklo : SDNode<"AArch64ISD::SUNPKLO", SDT_AArch64unpk>;
+def AArch64uunpkhi : SDNode<"AArch64ISD::UUNPKHI", SDT_AArch64unpk>;
+def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;
+
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
@@ -431,6 +448,13 @@ let RecomputePerFunction = 1 in {
   def UseBTI : Predicate<[{ MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
   def NotUseBTI : Predicate<[{ !MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
+
+  // Toggles patterns which aren't beneficial in GlobalISel when we aren't
+  // optimizing. This allows us to selectively use patterns without impacting
+  // SelectionDAG's behaviour.
+  // FIXME: One day there will probably be a nicer way to check for this, but
+  // today is not that day.
+  def OptimizedGISelOrOtherSelector : Predicate<"!MF->getFunction().hasOptNone() || MF->getProperties().hasProperty(MachineFunctionProperties::Property::FailedISel) || !MF->getProperties().hasProperty(MachineFunctionProperties::Property::Legalized)">;
 }
 
 include "AArch64InstrFormats.td"
@@ -785,7 +809,11 @@ def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins),
 let Uses = [ X9 ], Defs = [ X16, X17, LR, NZCV ] in {
 def HWASAN_CHECK_MEMACCESS : Pseudo<
   (outs), (ins GPR64noip:$ptr, i32imm:$accessinfo),
-  [(int_hwasan_check_memaccess X9, GPR64noip:$ptr, (i32 imm:$accessinfo))]>,
+  [(int_hwasan_check_memaccess X9, GPR64noip:$ptr, (i32 timm:$accessinfo))]>,
+  Sched<[]>;
+def HWASAN_CHECK_MEMACCESS_SHORTGRANULES : Pseudo<
+  (outs), (ins GPR64noip:$ptr, i32imm:$accessinfo),
+  [(int_hwasan_check_memaccess_shortgranules X9, GPR64noip:$ptr, (i32 timm:$accessinfo))]>,
   Sched<[]>;
 }
 
@@ -804,6 +832,23 @@ def : InstAlias<"sys $op1, $Cn, $Cm, $op2",
                 (SYSxt imm0_7:$op1, sys_cr_op:$Cn,
                  sys_cr_op:$Cm, imm0_7:$op2, XZR)>;
 
+
+let Predicates = [HasTME] in {
+
+def TSTART : TMSystemI<0b0000, "tstart",
+                      [(set GPR64:$Rt, (int_aarch64_tstart))]>;
+
+def TCOMMIT : TMSystemINoOperand<0b0000, "tcommit", [(int_aarch64_tcommit)]>;
+
+def TCANCEL : TMSystemException<0b011, "tcancel",
+                                [(int_aarch64_tcancel i64_imm0_65535:$imm)]>;
+
+def TTEST : TMSystemI<0b0001, "ttest", [(set GPR64:$Rt, (int_aarch64_ttest))]> {
+  let mayLoad = 0;
+  let mayStore = 0;
+}
+} // HasTME
+
 //===----------------------------------------------------------------------===//
 // Move immediate instructions.
 //===----------------------------------------------------------------------===//
@@ -815,37 +860,37 @@ let PostEncoderMethod = "fixMOVZ" in
 defm MOVZ : MoveImmediate<0b10, "movz">;
 
 // First group of aliases covers an implicit "lsl #0".
-def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0), 0>; -def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0), 0>; -def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, i32_imm0_65535:$imm, 0), 0>; +def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, i32_imm0_65535:$imm, 0), 0>; +def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, i32_imm0_65535:$imm, 0)>; +def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, i32_imm0_65535:$imm, 0)>; +def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, i32_imm0_65535:$imm, 0)>; +def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, i32_imm0_65535:$imm, 0)>; // Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax. -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g3:$sym, 48)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g2:$sym, 32)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g1:$sym, 16)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g0:$sym, 0)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g3:$sym, 48)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g2:$sym, 32)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g1:$sym, 16)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g0:$sym, 0)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48), 0>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32), 0>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16), 0>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g3:$sym, 48), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g2:$sym, 32), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g1:$sym, 16), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g0:$sym, 0), 0>; -def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; +def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movw_symbol_g1:$sym, 16)>; +def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movw_symbol_g0:$sym, 0)>; -def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; +def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, 
movw_symbol_g1:$sym, 16)>; +def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movw_symbol_g0:$sym, 0)>; -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16), 0>; -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movw_symbol_g1:$sym, 16), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movw_symbol_g0:$sym, 0), 0>; // Final group of aliases covers true "mov $Rd, $imm" cases. multiclass movw_mov_alias<string basename,Instruction INST, RegisterClass GPR, @@ -917,8 +962,12 @@ def trunc_imm : SDNodeXForm<imm, [{ def gi_trunc_imm : GICustomOperandRenderer<"renderTruncImm">, GISDNodeXFormEquiv<trunc_imm>; +let Predicates = [OptimizedGISelOrOtherSelector] in { +// The SUBREG_TO_REG isn't eliminated at -O0, which can result in pointless +// copies. def : Pat<(i64 i64imm_32bit:$src), (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>; +} // Materialize FP constants via MOVi32imm/MOVi64imm (MachO large code model). def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{ @@ -1012,10 +1061,10 @@ def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm), def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm), (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>; let AddedComplexity = 1 in { -def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3), - (SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>; -def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3), - (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>; +def : Pat<(sub GPR32sp:$R2, arith_extended_reg32_i32:$R3), + (SUBSWrx GPR32sp:$R2, arith_extended_reg32_i32:$R3)>; +def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64_i64:$R3), + (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64_i64:$R3)>; } // Because of the immediate format for add/sub-imm instructions, the @@ -2165,8 +2214,8 @@ def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>; def alignedglobal : PatLeaf<(iPTR iPTR:$label), [{ if (auto *G = dyn_cast<GlobalAddressSDNode>(N)) { const DataLayout &DL = MF->getDataLayout(); - unsigned Align = G->getGlobal()->getPointerAlignment(DL); - return Align >= 4 && G->getOffset() % 4 == 0; + MaybeAlign Align = G->getGlobal()->getPointerAlignment(DL); + return Align && *Align >= 4 && G->getOffset() % 4 == 0; } if (auto *C = dyn_cast<ConstantPoolSDNode>(N)) return C->getAlignment() >= 4 && C->getOffset() % 4 == 0; @@ -3281,20 +3330,37 @@ defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub", // N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike // the NEON variant. + +// Here we handle first -(a + b*c) for FNMADD: + +let Predicates = [HasNEON, HasFullFP16] in +def : Pat<(f16 (fma (fneg FPR16:$Rn), FPR16:$Rm, FPR16:$Ra)), + (FMSUBHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>; + def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)), (FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)), (FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; -// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and -// "(-a) + b*(-c)". 
+// Now it's time for "(-a) + (-b)*c"
+
+let Predicates = [HasNEON, HasFullFP16] in
+def : Pat<(f16 (fma (fneg FPR16:$Rn), FPR16:$Rm, (fneg FPR16:$Ra))),
+          (FNMADDHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>;
+
 def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))),
           (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
 
 def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))),
           (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
 
+// And here "(-a) + b*(-c)"
+
+let Predicates = [HasNEON, HasFullFP16] in
+def : Pat<(f16 (fma FPR16:$Rn, (fneg FPR16:$Rm), (fneg FPR16:$Ra))),
+          (FNMADDHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>;
+
 def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))),
           (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
 
@@ -6939,5 +7005,124 @@ def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
 
 def MOVMCSym : Pseudo<(outs GPR64:$dst), (ins i64imm:$sym), []>, Sched<[]>;
 def : Pat<(i64 (AArch64LocalRecover mcsym:$sym)), (MOVMCSym mcsym:$sym)>;
 
+// Extracting lane zero is a special case where we can just use a plain
+// EXTRACT_SUBREG instruction, which will become FMOV. This is easier for the
+// rest of the compiler, especially the register allocator and copy propagation,
+// to reason about, so is preferred when it's possible to use it.
+let AddedComplexity = 10 in {
+  def : Pat<(i64 (extractelt (v2i64 V128:$V), (i64 0))), (EXTRACT_SUBREG V128:$V, dsub)>;
+  def : Pat<(i32 (extractelt (v4i32 V128:$V), (i64 0))), (EXTRACT_SUBREG V128:$V, ssub)>;
+  def : Pat<(i32 (extractelt (v2i32 V64:$V), (i64 0))), (EXTRACT_SUBREG V64:$V, ssub)>;
+}
+
+// dot_v4i8
+class mul_v4i8<SDPatternOperator ldop> :
+  PatFrag<(ops node:$Rn, node:$Rm, node:$offset),
+          (mul (ldop (add node:$Rn, node:$offset)),
+               (ldop (add node:$Rm, node:$offset)))>;
+class mulz_v4i8<SDPatternOperator ldop> :
+  PatFrag<(ops node:$Rn, node:$Rm),
+          (mul (ldop node:$Rn), (ldop node:$Rm))>;
+
+def load_v4i8 :
+  OutPatFrag<(ops node:$R),
+             (INSERT_SUBREG
+              (v2i32 (IMPLICIT_DEF)),
+               (i32 (COPY_TO_REGCLASS (LDRWui node:$R, (i64 0)), FPR32)),
+              ssub)>;
+
+class dot_v4i8<Instruction DOT, SDPatternOperator ldop> :
+  Pat<(i32 (add (mul_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm, (i64 3)),
+                (add (mul_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm, (i64 2)),
+                     (add (mul_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm, (i64 1)),
+                          (mulz_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm))))),
+      (EXTRACT_SUBREG (i64 (DOT (DUPv2i32gpr WZR),
+                                (load_v4i8 GPR64sp:$Rn),
+                                (load_v4i8 GPR64sp:$Rm))),
+                      sub_32)>, Requires<[HasDotProd]>;
+
+// dot_v8i8
+class ee_v8i8<SDPatternOperator extend> :
+  PatFrag<(ops node:$V, node:$K),
+          (v4i16 (extract_subvector (v8i16 (extend node:$V)), node:$K))>;
+
+class mul_v8i8<SDPatternOperator mulop, SDPatternOperator extend> :
+  PatFrag<(ops node:$M, node:$N, node:$K),
+          (mulop (v4i16 (ee_v8i8<extend> node:$M, node:$K)),
+                 (v4i16 (ee_v8i8<extend> node:$N, node:$K)))>;
+
+class idot_v8i8<SDPatternOperator mulop, SDPatternOperator extend> :
+  PatFrag<(ops node:$M, node:$N),
+          (i32 (extractelt
+           (v4i32 (AArch64uaddv
+            (add (mul_v8i8<mulop, extend> node:$M, node:$N, (i64 0)),
+                 (mul_v8i8<mulop, extend> node:$M, node:$N, (i64 4))))),
+           (i64 0)))>;
+
+// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
+def VADDV_32 : OutPatFrag<(ops node:$R), (ADDPv2i32 node:$R, node:$R)>;
+
+class odot_v8i8<Instruction DOT> :
+  OutPatFrag<(ops node:$Vm, node:$Vn),
+             (EXTRACT_SUBREG
+              (VADDV_32
+               (i64 (DOT (DUPv2i32gpr WZR),
+                         (v8i8 node:$Vm),
+                         (v8i8 node:$Vn)))),
+              sub_32)>;
+
+class dot_v8i8<Instruction DOT, SDPatternOperator mulop,
+               SDPatternOperator extend> :
+  Pat<(idot_v8i8<mulop, extend> V64:$Vm, V64:$Vn),
+      (odot_v8i8<DOT> V64:$Vm, V64:$Vn)>,
+  Requires<[HasDotProd]>;
+
+// dot_v16i8
+class ee_v16i8<SDPatternOperator extend> :
+  PatFrag<(ops node:$V, node:$K1, node:$K2),
+          (v4i16 (extract_subvector
+           (v8i16 (extend
+            (v8i8 (extract_subvector node:$V, node:$K1)))), node:$K2))>;
+
+class mul_v16i8<SDPatternOperator mulop, SDPatternOperator extend> :
+  PatFrag<(ops node:$M, node:$N, node:$K1, node:$K2),
+          (v4i32
+           (mulop (v4i16 (ee_v16i8<extend> node:$M, node:$K1, node:$K2)),
+                  (v4i16 (ee_v16i8<extend> node:$N, node:$K1, node:$K2))))>;
+
+class idot_v16i8<SDPatternOperator m, SDPatternOperator x> :
+  PatFrag<(ops node:$M, node:$N),
+          (i32 (extractelt
+           (v4i32 (AArch64uaddv
+            (add
+             (add (mul_v16i8<m, x> node:$M, node:$N, (i64 0), (i64 0)),
+                  (mul_v16i8<m, x> node:$M, node:$N, (i64 8), (i64 0))),
+             (add (mul_v16i8<m, x> node:$M, node:$N, (i64 0), (i64 4)),
+                  (mul_v16i8<m, x> node:$M, node:$N, (i64 8), (i64 4)))))),
+           (i64 0)))>;
+
+class odot_v16i8<Instruction DOT> :
+  OutPatFrag<(ops node:$Vm, node:$Vn),
+             (i32 (ADDVv4i32v
+              (DOT (DUPv4i32gpr WZR), node:$Vm, node:$Vn)))>;
+
+class dot_v16i8<Instruction DOT, SDPatternOperator mulop,
+                SDPatternOperator extend> :
+  Pat<(idot_v16i8<mulop, extend> V128:$Vm, V128:$Vn),
+      (odot_v16i8<DOT> V128:$Vm, V128:$Vn)>,
+  Requires<[HasDotProd]>;
+
+let AddedComplexity = 10 in {
+  def : dot_v4i8<SDOTv8i8, sextloadi8>;
+  def : dot_v4i8<UDOTv8i8, zextloadi8>;
+  def : dot_v8i8<SDOTv8i8, AArch64smull, sext>;
+  def : dot_v8i8<UDOTv8i8, AArch64umull, zext>;
+  def : dot_v16i8<SDOTv16i8, AArch64smull, sext>;
+  def : dot_v16i8<UDOTv16i8, AArch64umull, zext>;
+
+  // FIXME: add patterns to generate vector by element dot product.
+  // FIXME: add SVE dot-product patterns.
+}
+
 include "AArch64InstrAtomics.td"
 include "AArch64SVEInstrInfo.td"
diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp
index 4e13fb8e2027..961f38cad1e4 100644
--- a/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -51,9 +51,19 @@ public:
                            const AArch64Subtarget &STI,
                            const AArch64RegisterBankInfo &RBI);
 
-  bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;
+  bool select(MachineInstr &I) override;
   static const char *getName() { return DEBUG_TYPE; }
 
+  void setupMF(MachineFunction &MF, GISelKnownBits &KB,
+               CodeGenCoverage &CoverageInfo) override {
+    InstructionSelector::setupMF(MF, KB, CoverageInfo);
+
+    // hasFnAttribute() is expensive to call on every BRCOND selection, so
+    // cache it here for each run of the selector.
+    ProduceNonFlagSettingCondBr =
+        !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
+  }
+
 private:
   /// tblgen-erated 'select' implementation, used as the initial selector for
   /// the patterns that don't require complex C++.
@@ -68,6 +78,10 @@ private:
 
   bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const;
 
+  /// Eliminate same-sized cross-bank copies into stores before selectImpl().
+  void contractCrossBankCopyIntoStore(MachineInstr &I,
+                                      MachineRegisterInfo &MRI) const;
+
   bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI) const;
   bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
@@ -101,8 +115,6 @@ private:
   bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
   bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
 
-  void collectShuffleMaskIndices(MachineInstr &I, MachineRegisterInfo &MRI,
-                                 SmallVectorImpl<Optional<int>> &Idxs) const;
   bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
   bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI) const;
   bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI) const;
@@ -116,6 +128,7 @@ private:
   bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const;
   bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI) const;
   bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const;
+  bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI) const;
 
   unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const;
   MachineInstr *emitLoadFromConstantPool(Constant *CPVal,
@@ -128,6 +141,8 @@ private:
   MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                    MachineOperand &Predicate,
                                    MachineIRBuilder &MIRBuilder) const;
+  MachineInstr *emitADD(Register DefReg, MachineOperand &LHS, MachineOperand &RHS,
+                        MachineIRBuilder &MIRBuilder) const;
   MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
   MachineInstr *emitTST(const Register &LHS, const Register &RHS,
@@ -155,7 +170,9 @@ private:
   ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
   ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;
 
+  ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
   ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
+  ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;
 
   ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
                                             unsigned Size) const;
@@ -183,11 +200,48 @@ private:
     return selectAddrModeIndexed(Root, Width / 8);
   }
 
+  bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
+                                     const MachineRegisterInfo &MRI) const;
+  ComplexRendererFns
+  selectAddrModeShiftedExtendXReg(MachineOperand &Root,
+                                  unsigned SizeInBytes) const;
+  ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
+  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
+                                       unsigned SizeInBytes) const;
+  template <int Width>
+  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
+    return selectAddrModeXRO(Root, Width / 8);
+  }
+
+  ComplexRendererFns selectShiftedRegister(MachineOperand &Root) const;
+
+  ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
+    return selectShiftedRegister(Root);
+  }
+
+  ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
+    // TODO: selectShiftedRegister should allow for rotates on logical shifts.
+    // For now, make them the same. The only difference between the two is that
+    // logical shifts are allowed to fold in rotates. Otherwise, these are
+    // functionally the same.
+    return selectShiftedRegister(Root);
+  }
+
+  /// Instructions that accept extend modifiers like UXTW expect the register
+  /// being extended to be a GPR32. Narrow ExtReg to a 32-bit register using a
+  /// subregister copy if necessary. Return either ExtReg, or the result of the
+  /// new copy.
+  Register narrowExtendRegIfNeeded(Register ExtReg,
+                                   MachineIRBuilder &MIB) const;
+  ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
+
   void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const;
+  void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I) const;
+  void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I) const;
 
   // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
   void materializeLargeCMVal(MachineInstr &I, const Value *V,
-                             unsigned char OpFlags) const;
+                             unsigned OpFlags) const;
 
   // Optimization methods.
   bool tryOptVectorShuffle(MachineInstr &I) const;
@@ -197,12 +251,22 @@ private:
                                      MachineOperand &Predicate,
                                      MachineIRBuilder &MIRBuilder) const;
 
+  /// Return true if \p MI is a load or store of \p NumBytes bytes.
+  bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
+
+  /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
+  /// register zeroed out. In other words, the result of MI has been explicitly
+  /// zero extended.
+  bool isDef32(const MachineInstr &MI) const;
+
   const AArch64TargetMachine &TM;
   const AArch64Subtarget &STI;
   const AArch64InstrInfo &TII;
   const AArch64RegisterInfo &TRI;
   const AArch64RegisterBankInfo &RBI;
 
+  bool ProduceNonFlagSettingCondBr = false;
+
 #define GET_GLOBALISEL_PREDICATES_DECL
 #include "AArch64GenGlobalISel.inc"
 #undef GET_GLOBALISEL_PREDICATES_DECL
@@ -312,7 +376,7 @@ static bool getSubRegForClass(const TargetRegisterClass *RC,
     SubReg = AArch64::hsub;
     break;
   case 32:
-    if (RC == &AArch64::GPR32RegClass)
+    if (RC != &AArch64::FPR32RegClass)
       SubReg = AArch64::sub_32;
     else
       SubReg = AArch64::ssub;
     break;
@@ -357,7 +421,7 @@ static bool unsupportedBinOp(const MachineInstr &I,
       // so, this will need to be taught about that, and we'll need to get the
       // bank out of the minimal class for the register.
       // Either way, this needs to be documented (and possibly verified).
-      if (!TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+      if (!Register::isVirtualRegister(MO.getReg())) {
        LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
        return true;
      }
@@ -492,8 +556,8 @@ static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
                        const MachineRegisterInfo &MRI,
                        const TargetRegisterInfo &TRI,
                        const RegisterBankInfo &RBI) {
-  const unsigned DstReg = I.getOperand(0).getReg();
-  const unsigned SrcReg = I.getOperand(1).getReg();
+  const Register DstReg = I.getOperand(0).getReg();
+  const Register SrcReg = I.getOperand(1).getReg();
  const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
  const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
@@ -502,7 +566,7 @@ static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
         (DstSize == SrcSize ||
          // Copies are a mean to setup initial types, the number of
          // bits may not exactly match.
-          (TargetRegisterInfo::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) ||
+          (Register::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) ||
          // Copies are a mean to copy bits around, as long as we are
          // on the same register class, that's fine. Otherwise, that
          // means we need some SUBREG_TO_REG or AND & co.
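The width rule that isValidCopy() applies in the hunk above reduces to a small predicate: a copy is acceptable when both registers have the same bit width, or when it narrows out of a physical register (copies are also used to establish initial generic types, so the bit counts need not match exactly). Below is a minimal standalone C++ sketch of just those two clauses, with plain integers standing in for LLVM's MachineInstr/RegisterBank machinery; the names are hypothetical and this is not LLVM API. The third clause visible in the context lines (same-register-class copies) is deliberately elided.

    #include <cassert>

    // Sketch of the width checks behind isValidCopy(): DstSize/SrcSize are
    // register widths in bits; SrcIsPhysical stands in for
    // Register::isPhysicalRegister(SrcReg).
    static bool copyWidthsLookValid(unsigned DstSize, unsigned SrcSize,
                                    bool SrcIsPhysical) {
      if (DstSize == SrcSize) // same width: always fine
        return true;
      // Narrowing out of a physical register is tolerated, since copies are
      // also a means to set up the initial types.
      return SrcIsPhysical && DstSize <= SrcSize;
    }

    int main() {
      assert(copyWidthsLookValid(32, 32, /*SrcIsPhysical=*/false));
      assert(copyWidthsLookValid(32, 64, /*SrcIsPhysical=*/true));   // narrowing
      assert(!copyWidthsLookValid(64, 32, /*SrcIsPhysical=*/true));  // widening
      return 0;
    }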
@@ -526,7 +590,7 @@ static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
/// SubRegCopy (To class) = COPY CopyReg:SubReg
/// Dst = COPY SubRegCopy
static bool selectSubregisterCopy(MachineInstr &I, MachineRegisterInfo &MRI,
-                                  const RegisterBankInfo &RBI, unsigned SrcReg,
+                                  const RegisterBankInfo &RBI, Register SrcReg,
                                  const TargetRegisterClass *From,
                                  const TargetRegisterClass *To,
                                  unsigned SubReg) {
@@ -539,7 +603,7 @@ static bool selectSubregisterCopy(MachineInstr &I, MachineRegisterInfo &MRI,
 
  // It's possible that the destination register won't be constrained. Make
  // sure that happens.
-  if (!TargetRegisterInfo::isPhysicalRegister(I.getOperand(0).getReg()))
+  if (!Register::isPhysicalRegister(I.getOperand(0).getReg()))
    RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);
 
  return true;
@@ -553,8 +617,8 @@ static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
                     MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
                     const RegisterBankInfo &RBI) {
-  unsigned DstReg = I.getOperand(0).getReg();
-  unsigned SrcReg = I.getOperand(1).getReg();
+  Register DstReg = I.getOperand(0).getReg();
+  Register SrcReg = I.getOperand(1).getReg();
  const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
  unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
@@ -579,8 +643,8 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
                       MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
                       const RegisterBankInfo &RBI) {
-  unsigned DstReg = I.getOperand(0).getReg();
-  unsigned SrcReg = I.getOperand(1).getReg();
+  Register DstReg = I.getOperand(0).getReg();
+  Register SrcReg = I.getOperand(1).getReg();
  const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
 
@@ -607,11 +671,10 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
  // result.
  auto CheckCopy = [&]() {
    // If we have a bitcast or something, we can't have physical registers.
-    assert(
-        (I.isCopy() ||
-         (!TargetRegisterInfo::isPhysicalRegister(I.getOperand(0).getReg()) &&
-          !TargetRegisterInfo::isPhysicalRegister(I.getOperand(1).getReg()))) &&
-        "No phys reg on generic operator!");
+    assert((I.isCopy() ||
+            (!Register::isPhysicalRegister(I.getOperand(0).getReg()) &&
+             !Register::isPhysicalRegister(I.getOperand(1).getReg()))) &&
+           "No phys reg on generic operator!");
    assert(KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI));
    (void)KnownValid;
    return true;
@@ -626,38 +689,38 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
    return false;
  }
 
-  // Is this a cross-bank copy?
-  if (DstRegBank.getID() != SrcRegBank.getID()) {
-    // If we're doing a cross-bank copy on different-sized registers, we need
-    // to do a bit more work.
-    unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
-    unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
-
-    if (SrcSize > DstSize) {
-      // We're doing a cross-bank copy into a smaller register. We need a
-      // subregister copy. First, get a register class that's on the same bank
-      // as the destination, but the same size as the source.
- const TargetRegisterClass *SubregRC = - getMinClassForRegBank(DstRegBank, SrcSize, true); - assert(SubregRC && "Didn't get a register class for subreg?"); + unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC); + unsigned DstSize = TRI.getRegSizeInBits(*DstRC); - // Get the appropriate subregister for the destination. - unsigned SubReg = 0; - if (!getSubRegForClass(DstRC, TRI, SubReg)) { - LLVM_DEBUG(dbgs() << "Couldn't determine subregister for copy.\n"); - return false; - } + // If we're doing a cross-bank copy on different-sized registers, we need + // to do a bit more work. + if (SrcSize > DstSize) { + // We're doing a cross-bank copy into a smaller register. We need a + // subregister copy. First, get a register class that's on the same bank + // as the destination, but the same size as the source. + const TargetRegisterClass *SubregRC = + getMinClassForRegBank(DstRegBank, SrcSize, true); + assert(SubregRC && "Didn't get a register class for subreg?"); - // Now, insert a subregister copy using the new register class. - selectSubregisterCopy(I, MRI, RBI, SrcReg, SubregRC, DstRC, SubReg); - return CheckCopy(); + // Get the appropriate subregister for the destination. + unsigned SubReg = 0; + if (!getSubRegForClass(DstRC, TRI, SubReg)) { + LLVM_DEBUG(dbgs() << "Couldn't determine subregister for copy.\n"); + return false; } - else if (DstRegBank.getID() == AArch64::GPRRegBankID && DstSize == 32 && - SrcSize == 16) { + // Now, insert a subregister copy using the new register class. + selectSubregisterCopy(I, MRI, RBI, SrcReg, SubregRC, DstRC, SubReg); + return CheckCopy(); + } + + // Is this a cross-bank copy? + if (DstRegBank.getID() != SrcRegBank.getID()) { + if (DstRegBank.getID() == AArch64::GPRRegBankID && DstSize == 32 && + SrcSize == 16) { // Special case for FPR16 to GPR32. // FIXME: This can probably be generalized like the above case. - unsigned PromoteReg = + Register PromoteReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG), PromoteReg) @@ -674,7 +737,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, // If the destination is a physical register, then there's nothing to // change, so we're done. 
-    if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+    if (Register::isPhysicalRegister(DstReg))
      return CheckCopy();
  }
 
@@ -955,7 +1018,9 @@ bool AArch64InstructionSelector::selectVectorSHL(
    return false;
 
  unsigned Opc = 0;
-  if (Ty == LLT::vector(4, 32)) {
+  if (Ty == LLT::vector(2, 64)) {
+    Opc = AArch64::USHLv2i64;
+  } else if (Ty == LLT::vector(4, 32)) {
    Opc = AArch64::USHLv4i32;
  } else if (Ty == LLT::vector(2, 32)) {
    Opc = AArch64::USHLv2i32;
@@ -989,7 +1054,11 @@ bool AArch64InstructionSelector::selectVectorASHR(
  unsigned Opc = 0;
  unsigned NegOpc = 0;
  const TargetRegisterClass *RC = nullptr;
-  if (Ty == LLT::vector(4, 32)) {
+  if (Ty == LLT::vector(2, 64)) {
+    Opc = AArch64::SSHLv2i64;
+    NegOpc = AArch64::NEGv2i64;
+    RC = &AArch64::FPR128RegClass;
+  } else if (Ty == LLT::vector(4, 32)) {
    Opc = AArch64::SSHLv4i32;
    NegOpc = AArch64::NEGv4i32;
    RC = &AArch64::FPR128RegClass;
@@ -1044,7 +1113,7 @@ bool AArch64InstructionSelector::selectVaStartDarwin(
}
 
void AArch64InstructionSelector::materializeLargeCMVal(
-    MachineInstr &I, const Value *V, unsigned char OpFlags) const {
+    MachineInstr &I, const Value *V, unsigned OpFlags) const {
  MachineBasicBlock &MBB = *I.getParent();
  MachineFunction &MF = *MBB.getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -1097,8 +1166,8 @@ void AArch64InstructionSelector::preISelLower(MachineInstr &I) const {
    // some reason we receive input GMIR that has an s64 shift amount that's not
    // a G_CONSTANT, insert a truncate so that we can still select the s32
    // register-register variant.
-    unsigned SrcReg = I.getOperand(1).getReg();
-    unsigned ShiftReg = I.getOperand(2).getReg();
+    Register SrcReg = I.getOperand(1).getReg();
+    Register ShiftReg = I.getOperand(2).getReg();
    const LLT ShiftTy = MRI.getType(ShiftReg);
    const LLT SrcTy = MRI.getType(SrcReg);
    if (SrcTy.isVector())
@@ -1118,6 +1187,9 @@ void AArch64InstructionSelector::preISelLower(MachineInstr &I) const {
    }
    return;
  }
+  case TargetOpcode::G_STORE:
+    contractCrossBankCopyIntoStore(I, MRI);
+    return;
  default:
    return;
  }
@@ -1158,6 +1230,48 @@ bool AArch64InstructionSelector::earlySelectSHL(
  return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
}
 
+void AArch64InstructionSelector::contractCrossBankCopyIntoStore(
+    MachineInstr &I, MachineRegisterInfo &MRI) const {
+  assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
+  // If we're storing a scalar, it doesn't matter what register bank that
+  // scalar is on. All that matters is the size.
+  //
+  // So, if we see something like this (with a 32-bit scalar as an example):
+  //
+  // %x:gpr(s32) = ... something ...
+  // %y:fpr(s32) = COPY %x:gpr(s32)
+  // G_STORE %y:fpr(s32)
+  //
+  // We can fix this up into something like this:
+  //
+  // G_STORE %x:gpr(s32)
+  //
+  // And then continue the selection process normally.
+  MachineInstr *Def = getDefIgnoringCopies(I.getOperand(0).getReg(), MRI);
+  if (!Def)
+    return;
+  Register DefDstReg = Def->getOperand(0).getReg();
+  LLT DefDstTy = MRI.getType(DefDstReg);
+  Register StoreSrcReg = I.getOperand(0).getReg();
+  LLT StoreSrcTy = MRI.getType(StoreSrcReg);
+
+  // If we get something strange like a physical register, then we shouldn't
+  // go any further.
+  if (!DefDstTy.isValid())
+    return;
+
+  // Are the source and dst types the same size?
+  if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
+    return;
+
+  if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
+      RBI.getRegBank(DefDstReg, MRI, TRI))
+    return;
+
+  // We have a cross-bank copy, which is entering a store. Let's fold it.
+  I.getOperand(0).setReg(DefDstReg);
+}
+
bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
  assert(I.getParent() && "Instruction should be in a basic block!");
  assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -1169,13 +1283,37 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
  switch (I.getOpcode()) {
  case TargetOpcode::G_SHL:
    return earlySelectSHL(I, MRI);
+  case TargetOpcode::G_CONSTANT: {
+    bool IsZero = false;
+    if (I.getOperand(1).isCImm())
+      IsZero = I.getOperand(1).getCImm()->getZExtValue() == 0;
+    else if (I.getOperand(1).isImm())
+      IsZero = I.getOperand(1).getImm() == 0;
+
+    if (!IsZero)
+      return false;
+
+    Register DefReg = I.getOperand(0).getReg();
+    LLT Ty = MRI.getType(DefReg);
+    if (Ty != LLT::scalar(64) && Ty != LLT::scalar(32))
+      return false;
+
+    if (Ty == LLT::scalar(64)) {
+      I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
+      RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
+    } else {
+      I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
+      RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
+    }
+    I.setDesc(TII.get(TargetOpcode::COPY));
+    return true;
+  }
  default:
    return false;
  }
}
 
-bool AArch64InstructionSelector::select(MachineInstr &I,
-                                        CodeGenCoverage &CoverageInfo) const {
+bool AArch64InstructionSelector::select(MachineInstr &I) {
  assert(I.getParent() && "Instruction should be in a basic block!");
  assert(I.getParent()->getParent() && "Instruction should be in a function!");
 
@@ -1244,7 +1382,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
  if (earlySelect(I))
    return true;
 
-  if (selectImpl(I, CoverageInfo))
+  if (selectImpl(I, *CoverageInfo))
    return true;
 
  LLT Ty =
@@ -1439,14 +1577,43 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
    return true;
  }
  case TargetOpcode::G_EXTRACT: {
-    LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
-    LLT DstTy = MRI.getType(I.getOperand(0).getReg());
+    Register DstReg = I.getOperand(0).getReg();
+    Register SrcReg = I.getOperand(1).getReg();
+    LLT SrcTy = MRI.getType(SrcReg);
+    LLT DstTy = MRI.getType(DstReg);
    (void)DstTy;
    unsigned SrcSize = SrcTy.getSizeInBits();
-    // Larger extracts are vectors, same-size extracts should be something else
-    // by now (either split up or simplified to a COPY).
-    if (SrcTy.getSizeInBits() > 64 || Ty.getSizeInBits() > 32)
-      return false;
+
+    if (SrcTy.getSizeInBits() > 64) {
+      // This should be an extract of an s128, which is like a vector extract.
+      if (SrcTy.getSizeInBits() != 128)
+        return false;
+      // Only support extracting 64 bits from an s128 at the moment.
+      if (DstTy.getSizeInBits() != 64)
+        return false;
+
+      const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
+      const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
+      // Check we have the right regbank always.
+      assert(SrcRB.getID() == AArch64::FPRRegBankID &&
+             DstRB.getID() == AArch64::FPRRegBankID &&
+             "Wrong extract regbank!");
+      (void)SrcRB;
+
+      // Emit the same code as a vector extract.
+      // Offset must be a multiple of 64.
+      unsigned Offset = I.getOperand(2).getImm();
+      if (Offset % 64 != 0)
+        return false;
+      unsigned LaneIdx = Offset / 64;
+      MachineIRBuilder MIB(I);
+      MachineInstr *Extract = emitExtractVectorElt(
+          DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
+      if (!Extract)
+        return false;
+      I.eraseFromParent();
+      return true;
+    }
 
    I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
    MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
@@ -1458,7 +1625,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }
 
-    Register DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
+    DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
    MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
    MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
        .addReg(DstReg, 0, AArch64::sub_32);
@@ -1521,11 +1688,10 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
 
  case TargetOpcode::G_GLOBAL_VALUE: {
    auto GV = I.getOperand(1).getGlobal();
-    if (GV->isThreadLocal()) {
-      // FIXME: we don't support TLS yet.
-      return false;
-    }
-    unsigned char OpFlags = STI.ClassifyGlobalReference(GV, TM);
+    if (GV->isThreadLocal())
+      return selectTLSGlobalValue(I, MRI);
+
+    unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
    if (OpFlags & AArch64II::MO_GOT) {
      I.setDesc(TII.get(AArch64::LOADgot));
      I.getOperand(1).setTargetFlags(OpFlags);
@@ -1562,8 +1728,15 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
    }
 
    auto &MemOp = **I.memoperands_begin();
-    if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
-      LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+    if (MemOp.isAtomic()) {
+      // For now we just support s8 acquire loads to be able to compile stack
+      // protector code.
+      if (MemOp.getOrdering() == AtomicOrdering::Acquire &&
+          MemOp.getSize() == 1) {
+        I.setDesc(TII.get(AArch64::LDARB));
+        return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+      }
+      LLVM_DEBUG(dbgs() << "Atomic load/store not fully supported yet\n");
      return false;
    }
    unsigned MemSizeInBits = MemOp.getSize() * 8;
@@ -1598,7 +1771,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
      const unsigned Size = MemSizeInBits / 8;
      const unsigned Scale = Log2_32(Size);
      if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) {
-        unsigned Ptr2Reg = PtrMI->getOperand(1).getReg();
+        Register Ptr2Reg = PtrMI->getOperand(1).getReg();
        I.getOperand(1).setReg(Ptr2Reg);
        PtrMI = MRI.getVRegDef(Ptr2Reg);
        Offset = Imm / Size;
@@ -1688,8 +1861,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
      return selectVectorSHL(I, MRI);
    LLVM_FALLTHROUGH;
  case TargetOpcode::G_OR:
-  case TargetOpcode::G_LSHR:
-  case TargetOpcode::G_GEP: {
+  case TargetOpcode::G_LSHR: {
    // Reject the various things we don't support yet.
    if (unsupportedBinOp(I, RBI, MRI, TRI))
      return false;
@@ -1711,6 +1883,13 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }
 
+  case TargetOpcode::G_GEP: {
+    MachineIRBuilder MIRBuilder(I);
+    emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2),
+            MIRBuilder);
+    I.eraseFromParent();
+    return true;
+  }
  case TargetOpcode::G_UADDO: {
    // TODO: Support other types.
unsigned OpSize = Ty.getSizeInBits(); @@ -1816,6 +1995,16 @@ bool AArch64InstructionSelector::select(MachineInstr &I, constrainSelectedInstRegOperands(I, TII, TRI, RBI); return true; } + + if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) { + MachineIRBuilder MIB(I); + MachineInstr *Extract = emitExtractVectorElt( + DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB); + if (!Extract) + return false; + I.eraseFromParent(); + return true; + } } return false; @@ -1868,21 +2057,41 @@ bool AArch64InstructionSelector::select(MachineInstr &I, case TargetOpcode::G_ZEXT: case TargetOpcode::G_SEXT: { unsigned Opcode = I.getOpcode(); - const LLT DstTy = MRI.getType(I.getOperand(0).getReg()), - SrcTy = MRI.getType(I.getOperand(1).getReg()); - const bool isSigned = Opcode == TargetOpcode::G_SEXT; const Register DefReg = I.getOperand(0).getReg(); const Register SrcReg = I.getOperand(1).getReg(); - const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); + const bool IsSigned = Opcode == TargetOpcode::G_SEXT; + const LLT DstTy = MRI.getType(DefReg); + const LLT SrcTy = MRI.getType(SrcReg); + unsigned DstSize = DstTy.getSizeInBits(); + unsigned SrcSize = SrcTy.getSizeInBits(); - if (RB.getID() != AArch64::GPRRegBankID) { - LLVM_DEBUG(dbgs() << TII.getName(I.getOpcode()) << " on bank: " << RB - << ", expected: GPR\n"); - return false; - } + assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() == + AArch64::GPRRegBankID && + "Unexpected ext regbank"); + MachineIRBuilder MIB(I); MachineInstr *ExtI; - if (DstTy == LLT::scalar(64)) { + if (DstTy.isVector()) + return false; // Should be handled by imported patterns. + + // First, check whether we're extending the result of a load with a dest + // type smaller than 32 bits; if so, this zext is redundant. GPR32 is the + // smallest GPR register on AArch64, and all smaller loads automatically + // zero-extend the upper bits. E.g. + // %v(s8) = G_LOAD %p, :: (load 1) + // %v2(s32) = G_ZEXT %v(s8) + if (!IsSigned) { + auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI); + if (LoadMI && + RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID) { + const MachineMemOperand *MemOp = *LoadMI->memoperands_begin(); + unsigned BytesLoaded = MemOp->getSize(); + if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded) + return selectCopy(I, TII, MRI, TRI, RBI); + } + } + + if (DstSize == 64) { // FIXME: Can we avoid manually doing this? if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, MRI)) { LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode) @@ -1890,33 +2099,26 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return false; } - const Register SrcXReg = - MRI.createVirtualRegister(&AArch64::GPR64RegClass); - BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) - .addDef(SrcXReg) - .addImm(0) - .addUse(SrcReg) - .addImm(AArch64::sub_32); + auto SubregToReg = + MIB.buildInstr(AArch64::SUBREG_TO_REG, {&AArch64::GPR64RegClass}, {}) + .addImm(0) + .addUse(SrcReg) + .addImm(AArch64::sub_32); - const unsigned NewOpc = isSigned ? AArch64::SBFMXri : AArch64::UBFMXri; - ExtI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc)) - .addDef(DefReg) - .addUse(SrcXReg) - .addImm(0) - .addImm(SrcTy.getSizeInBits() - 1); - } else if (DstTy.isScalar() && DstTy.getSizeInBits() <= 32) { - const unsigned NewOpc = isSigned ?
AArch64::SBFMWri : AArch64::UBFMWri; - ExtI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc)) - .addDef(DefReg) - .addUse(SrcReg) - .addImm(0) - .addImm(SrcTy.getSizeInBits() - 1); + ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri, + {DefReg}, {SubregToReg}) + .addImm(0) + .addImm(SrcSize - 1); + } else if (DstSize <= 32) { + ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri, + {DefReg}, {SrcReg}) + .addImm(0) + .addImm(SrcSize - 1); } else { return false; } constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); - I.eraseFromParent(); return true; } @@ -2163,6 +2365,37 @@ bool AArch64InstructionSelector::selectJumpTable( return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); } +bool AArch64InstructionSelector::selectTLSGlobalValue( + MachineInstr &I, MachineRegisterInfo &MRI) const { + if (!STI.isTargetMachO()) + return false; + MachineFunction &MF = *I.getParent()->getParent(); + MF.getFrameInfo().setAdjustsStack(true); + + const GlobalValue &GV = *I.getOperand(1).getGlobal(); + MachineIRBuilder MIB(I); + + MIB.buildInstr(AArch64::LOADgot, {AArch64::X0}, {}) + .addGlobalAddress(&GV, 0, AArch64II::MO_TLS); + + auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass}, + {Register(AArch64::X0)}) + .addImm(0); + + // TLS calls preserve all registers except those that absolutely must be + // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be + // silly). + MIB.buildInstr(AArch64::BLR, {}, {Load}) + .addDef(AArch64::X0, RegState::Implicit) + .addRegMask(TRI.getTLSCallPreservedMask()); + + MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0)); + RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass, + MRI); + I.eraseFromParent(); + return true; +} + bool AArch64InstructionSelector::selectIntrinsicTrunc( MachineInstr &I, MachineRegisterInfo &MRI) const { const LLT SrcTy = MRI.getType(I.getOperand(0).getReg()); @@ -2478,16 +2711,40 @@ bool AArch64InstructionSelector::selectMergeValues( const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation"); + const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); - // At the moment we only support merging two s32s into an s64. if (I.getNumOperands() != 3) return false; - if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32) - return false; - const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); + + // Merging 2 s64s into an s128. 
+ if (DstTy == LLT::scalar(128)) { + if (SrcTy.getSizeInBits() != 64) + return false; + MachineIRBuilder MIB(I); + Register DstReg = I.getOperand(0).getReg(); + Register Src1Reg = I.getOperand(1).getReg(); + Register Src2Reg = I.getOperand(2).getReg(); + auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {}); + MachineInstr *InsMI = + emitLaneInsert(None, Tmp.getReg(0), Src1Reg, /* LaneIdx */ 0, RB, MIB); + if (!InsMI) + return false; + MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(), + Src2Reg, /* LaneIdx */ 1, RB, MIB); + if (!Ins2MI) + return false; + constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI); + I.eraseFromParent(); + return true; + } + if (RB.getID() != AArch64::GPRRegBankID) return false; + if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32) + return false; + auto *DstRC = &AArch64::GPR64RegClass; Register SubToRegDef = MRI.createVirtualRegister(DstRC); MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(), @@ -2695,7 +2952,8 @@ bool AArch64InstructionSelector::selectUnmergeValues( const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); const LLT WideTy = MRI.getType(SrcReg); (void)WideTy; - assert(WideTy.isVector() && "can only unmerge from vector types!"); + assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) && + "can only unmerge from vector or s128 types!"); assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() && "source register size too small!"); @@ -2802,29 +3060,6 @@ bool AArch64InstructionSelector::selectConcatVectors( return true; } -void AArch64InstructionSelector::collectShuffleMaskIndices( - MachineInstr &I, MachineRegisterInfo &MRI, - SmallVectorImpl<Optional<int>> &Idxs) const { - MachineInstr *MaskDef = MRI.getVRegDef(I.getOperand(3).getReg()); - assert( - MaskDef->getOpcode() == TargetOpcode::G_BUILD_VECTOR && - "G_SHUFFLE_VECTOR should have a constant mask operand as G_BUILD_VECTOR"); - // Find the constant indices. - for (unsigned i = 1, e = MaskDef->getNumOperands(); i < e; ++i) { - // Look through copies. - MachineInstr *ScalarDef = - getDefIgnoringCopies(MaskDef->getOperand(i).getReg(), MRI); - assert(ScalarDef && "Could not find vreg def of shufflevec index op"); - if (ScalarDef->getOpcode() != TargetOpcode::G_CONSTANT) { - // This be an undef if not a constant. - assert(ScalarDef->getOpcode() == TargetOpcode::G_IMPLICIT_DEF); - Idxs.push_back(None); - } else { - Idxs.push_back(ScalarDef->getOperand(1).getCImm()->getSExtValue()); - } - } -} - unsigned AArch64InstructionSelector::emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const { @@ -2906,6 +3141,31 @@ getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) { } MachineInstr * +AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS, + MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const { + assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); + MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); + static const unsigned OpcTable[2][2]{{AArch64::ADDXrr, AArch64::ADDXri}, + {AArch64::ADDWrr, AArch64::ADDWri}}; + bool Is32Bit = MRI.getType(LHS.getReg()).getSizeInBits() == 32; + auto ImmFns = selectArithImmed(RHS); + unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()]; + auto AddMI = MIRBuilder.buildInstr(Opc, {DefReg}, {LHS.getReg()}); + + // If we matched a valid constant immediate, add those operands. 
+ if (ImmFns) { + for (auto &RenderFn : *ImmFns) + RenderFn(AddMI); + } else { + AddMI.addUse(RHS.getReg()); + } + + constrainSelectedInstRegOperands(*AddMI, TII, TRI, RBI); + return &*AddMI; +} + +MachineInstr * AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const { assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); @@ -3151,7 +3411,7 @@ bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const { // Can't see past copies from physregs. if (Opc == TargetOpcode::COPY && - TargetRegisterInfo::isPhysicalRegister(CondDef->getOperand(1).getReg())) + Register::isPhysicalRegister(CondDef->getOperand(1).getReg())) return false; CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg()); @@ -3342,16 +3602,9 @@ bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const { return false; // The shuffle's second operand doesn't matter if the mask is all zero. - auto *ZeroVec = getOpcodeDef(G_BUILD_VECTOR, I.getOperand(3).getReg(), MRI); - if (!ZeroVec) - return false; - int64_t Zero = 0; - if (!mi_match(ZeroVec->getOperand(1).getReg(), MRI, m_ICst(Zero)) || Zero) + const Constant *Mask = I.getOperand(3).getShuffleMask(); + if (!isa<ConstantAggregateZero>(Mask)) return false; - for (unsigned i = 1, e = ZeroVec->getNumOperands() - 1; i < e; ++i) { - if (ZeroVec->getOperand(i).getReg() != ZeroVec->getOperand(1).getReg()) - return false; // This wasn't an all zeros vector. - } // We're done, now find out what kind of splat we need. LLT VecTy = MRI.getType(I.getOperand(0).getReg()); @@ -3399,19 +3652,14 @@ bool AArch64InstructionSelector::selectShuffleVector( const LLT Src1Ty = MRI.getType(Src1Reg); Register Src2Reg = I.getOperand(2).getReg(); const LLT Src2Ty = MRI.getType(Src2Reg); + const Constant *ShuffleMask = I.getOperand(3).getShuffleMask(); MachineBasicBlock &MBB = *I.getParent(); MachineFunction &MF = *MBB.getParent(); LLVMContext &Ctx = MF.getFunction().getContext(); - // G_SHUFFLE_VECTOR doesn't really have a strictly enforced constant mask - // operand, it comes in as a normal vector value which we have to analyze to - // find the mask indices. If the mask element is undef, then - // collectShuffleMaskIndices() will add a None entry for that index into - // the list. - SmallVector<Optional<int>, 8> Mask; - collectShuffleMaskIndices(I, MRI, Mask); - assert(!Mask.empty() && "Expected to find mask indices"); + SmallVector<int, 8> Mask; + ShuffleVectorInst::getShuffleMask(ShuffleMask, Mask); // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if // it's originated from a <1 x T> type. Those should have been lowered into @@ -3424,10 +3672,10 @@ bool AArch64InstructionSelector::selectShuffleVector( unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8; SmallVector<Constant *, 64> CstIdxs; - for (auto &MaybeVal : Mask) { + for (int Val : Mask) { // For now, any undef indexes we'll just assume to be 0. This should be // optimized in future, e.g. to select DUP etc. - int Val = MaybeVal.hasValue() ? *MaybeVal : 0; + Val = Val < 0 ? 0 : Val; for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { unsigned Offset = Byte + Val * BytesPerElt; CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset)); @@ -3684,21 +3932,6 @@ static unsigned findIntrinsicID(MachineInstr &I) { return IntrinOp->getIntrinsicID(); } -/// Helper function to emit the correct opcode for a llvm.aarch64.stlxr -/// intrinsic. 
-static unsigned getStlxrOpcode(unsigned NumBytesToStore) { - switch (NumBytesToStore) { - // TODO: 1, 2, and 4 byte stores. - case 8: - return AArch64::STLXRX; - default: - LLVM_DEBUG(dbgs() << "Unexpected number of bytes to store! (" - << NumBytesToStore << ")\n"); - break; - } - return 0; -} - bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( MachineInstr &I, MachineRegisterInfo &MRI) const { // Find the intrinsic ID. @@ -3719,32 +3952,6 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( return false; MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000); break; - case Intrinsic::aarch64_stlxr: - Register StatReg = I.getOperand(0).getReg(); - assert(RBI.getSizeInBits(StatReg, MRI, TRI) == 32 && - "Status register must be 32 bits!"); - Register SrcReg = I.getOperand(2).getReg(); - - if (RBI.getSizeInBits(SrcReg, MRI, TRI) != 64) { - LLVM_DEBUG(dbgs() << "Only support 64-bit sources right now.\n"); - return false; - } - - Register PtrReg = I.getOperand(3).getReg(); - assert(MRI.getType(PtrReg).isPointer() && "Expected pointer operand"); - - // Expect only one memory operand. - if (!I.hasOneMemOperand()) - return false; - - const MachineMemOperand *MemOp = *I.memoperands_begin(); - unsigned NumBytesToStore = MemOp->getSize(); - unsigned Opc = getStlxrOpcode(NumBytesToStore); - if (!Opc) - return false; - - auto StoreMI = MIRBuilder.buildInstr(Opc, {StatReg}, {SrcReg, PtrReg}); - constrainSelectedInstRegOperands(*StoreMI, TII, TRI, RBI); } I.eraseFromParent(); @@ -3860,6 +4067,30 @@ AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const { return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; } +/// Helper to select an immediate value that can be represented as a 12-bit +/// value shifted left by either 0 or 12. If it is possible to do so, return +/// the immediate and shift value. If not, return None. +/// +/// Used by selectArithImmed and selectNegArithImmed. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::select12BitValueWithLeftShift( + uint64_t Immed) const { + unsigned ShiftAmt; + if (Immed >> 12 == 0) { + ShiftAmt = 0; + } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { + ShiftAmt = 12; + Immed = Immed >> 12; + } else + return None; + + unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); }, + }}; +} + /// SelectArithImmed - Select an immediate value that can be represented as /// a 12-bit value shifted left by either 0 or 12. If so, return true with /// Val set to the 12-bit value and Shift set to the shifter operand. @@ -3873,22 +4104,229 @@ AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { auto MaybeImmed = getImmedFromMO(Root); if (MaybeImmed == None) return None; + return select12BitValueWithLeftShift(*MaybeImmed); +} + +/// SelectNegArithImmed - As above, but negates the value before trying to +/// select it. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const { + // We need a register here, because we need to know if we have a 64 or 32 + // bit immediate. 
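// [Aside, not part of the patch] selectNegArithImmed's trick in plain
// arithmetic: "cmp w0, #-5" has no encoding, but negating the immediate gives
// "cmn w0, #5", which does. The helper names are invented; encode12Bit mirrors
// select12BitValueWithLeftShift above.
#include <cstdint>
#include <optional>
struct ArithImm { unsigned Val12, ShiftAmt; };       // 12-bit value, LSL 0/12
static std::optional<ArithImm> encode12Bit(uint64_t Imm) {
  if ((Imm >> 12) == 0)
    return ArithImm{unsigned(Imm), 0u};
  if ((Imm & 0xfff) == 0 && (Imm >> 24) == 0)
    return ArithImm{unsigned(Imm >> 12), 12u};
  return std::nullopt;
}
static std::optional<ArithImm> encodeNegated(uint64_t Imm, bool Is32Bit) {
  if (Imm == 0)
    return std::nullopt; // cmp/cmn #0 disagree on the C flag; never match
  Imm = Is32Bit ? uint64_t(~uint32_t(Imm) + 1u) : ~Imm + 1ULL;
  if (Imm & 0xFFFFFFFFFF000000ULL)
    return std::nullopt; // negated value must fit the 24 encodable bits
  return encode12Bit(Imm & 0xFFFFFFULL);
}
// encodeNegated(uint64_t(-5), /*Is32Bit=*/true) -> {5, 0}, i.e. cmn w0, #5.
// [End aside]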
+ if (!Root.isReg()) + return None; + auto MaybeImmed = getImmedFromMO(Root); + if (MaybeImmed == None) + return None; uint64_t Immed = *MaybeImmed; - unsigned ShiftAmt; - if (Immed >> 12 == 0) { - ShiftAmt = 0; - } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { - ShiftAmt = 12; - Immed = Immed >> 12; - } else + // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" + // have the opposite effect on the C flag, so this pattern mustn't match under + // those circumstances. + if (Immed == 0) return None; - unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); }, - }}; + // Check if we're dealing with a 32-bit type on the root or a 64-bit type on + // the root. + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + if (MRI.getType(Root.getReg()).getSizeInBits() == 32) + Immed = ~((uint32_t)Immed) + 1; + else + Immed = ~Immed + 1ULL; + + if (Immed & 0xFFFFFFFFFF000000ULL) + return None; + + Immed &= 0xFFFFFFULL; + return select12BitValueWithLeftShift(Immed); +} + +/// Return true if it is worth folding MI into an extended register. That is, +/// if it's safe to pull it into the addressing mode of a load or store as a +/// shift. +bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg( + MachineInstr &MI, const MachineRegisterInfo &MRI) const { + // Always fold if there is one use, or if we're optimizing for size. + Register DefReg = MI.getOperand(0).getReg(); + if (MRI.hasOneUse(DefReg) || + MI.getParent()->getParent()->getFunction().hasMinSize()) + return true; + + // It's better to avoid folding and recomputing shifts when we don't have a + // fastpath. + if (!STI.hasLSLFast()) + return false; + + // We have a fastpath, so folding a shift in and potentially computing it + // many times may be beneficial. Check if this is only used in memory ops. + // If it is, then we should fold. + return all_of(MRI.use_instructions(DefReg), + [](MachineInstr &Use) { return Use.mayLoadOrStore(); }); +} + +/// This is used for computing addresses like this: +/// +/// ldr x1, [x2, x3, lsl #3] +/// +/// Where x2 is the base register, and x3 is an offset register. The shift-left +/// is a constant value specific to this load instruction. That is, we'll never +/// see anything other than a 3 here (which corresponds to the size of the +/// element being loaded.) +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeShiftedExtendXReg( + MachineOperand &Root, unsigned SizeInBytes) const { + if (!Root.isReg()) + return None; + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + // Make sure that the memory op is a valid size. + int64_t LegalShiftVal = Log2_32(SizeInBytes); + if (LegalShiftVal == 0) + return None; + + // We want to find something like this: + // + // val = G_CONSTANT LegalShiftVal + // shift = G_SHL off_reg val + // ptr = G_GEP base_reg shift + // x = G_LOAD ptr + // + // And fold it into this addressing mode: + // + // ldr x, [base_reg, off_reg, lsl #LegalShiftVal] + + // Check if we can find the G_GEP. + MachineInstr *Gep = getOpcodeDef(TargetOpcode::G_GEP, Root.getReg(), MRI); + if (!Gep || !isWorthFoldingIntoExtendedReg(*Gep, MRI)) + return None; + + // Now, try to match an opcode which will match our specific offset. + // We want a G_SHL or a G_MUL. 
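// [Aside, not part of the patch] Why a G_MUL qualifies below: multiplying an
// index by a power-of-two constant equals shifting left by its log2, which
// the real code checks with isPowerOf2_32 and Log2_32. Hypothetical
// standalone equivalent:
#include <cstdint>
#include <optional>
static std::optional<unsigned> shiftAmountForMul(uint64_t C) {
  if (C == 0 || (C & (C - 1)) != 0)
    return std::nullopt;          // not a power of two: cannot become a shift
  unsigned Sh = 0;
  while ((C >>= 1) != 0)
    ++Sh;                         // Sh = log2 of the original constant
  return Sh;
}
// For an 8-byte access, %off = G_MUL %idx, 8 yields shiftAmountForMul(8) == 3,
// which matches LegalShiftVal and folds to: ldr x0, [xBase, xIdx, lsl #3].
// [End aside]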
+ MachineInstr *OffsetInst = getDefIgnoringCopies(Gep->getOperand(2).getReg(), MRI); + if (!OffsetInst) + return None; + + unsigned OffsetOpc = OffsetInst->getOpcode(); + if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) + return None; + + if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) + return None; + + // Now, try to find the specific G_CONSTANT. Start by assuming that the + // register we will offset is the LHS, and the register containing the + // constant is the RHS. + Register OffsetReg = OffsetInst->getOperand(1).getReg(); + Register ConstantReg = OffsetInst->getOperand(2).getReg(); + auto ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI); + if (!ValAndVReg) { + // We didn't get a constant on the RHS. If the opcode is a shift, then + // we're done. + if (OffsetOpc == TargetOpcode::G_SHL) + return None; + + // If we have a G_MUL, we can use either register. Try looking at the RHS. + std::swap(OffsetReg, ConstantReg); + ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI); + if (!ValAndVReg) + return None; + } + + // The value must fit into 3 bits, and must be positive. Make sure that is + // true. + int64_t ImmVal = ValAndVReg->Value; + + // Since we're going to pull this into a shift, the constant value must be + // a power of 2. If we got a multiply, then we need to check this. + if (OffsetOpc == TargetOpcode::G_MUL) { + if (!isPowerOf2_32(ImmVal)) + return None; + + // Got a power of 2. So, the amount we'll shift is the log base-2 of that. + ImmVal = Log2_32(ImmVal); + } + + if ((ImmVal & 0x7) != ImmVal) + return None; + + // We are only allowed to shift by LegalShiftVal. This shift value is built + // into the instruction, so we can't just use whatever we want. + if (ImmVal != LegalShiftVal) + return None; + + // We can use the LHS of the GEP as the base, and the LHS of the shift as an + // offset. Signify that we are shifting by setting the shift flag to 1. + return {{[=](MachineInstrBuilder &MIB) { + MIB.addUse(Gep->getOperand(1).getReg()); + }, + [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); }, + [=](MachineInstrBuilder &MIB) { + // Need to add both immediates here to make sure that they are both + // added to the instruction. + MIB.addImm(0); + MIB.addImm(1); + }}}; +} + +/// This is used for computing addresses like this: +/// +/// ldr x1, [x2, x3] +/// +/// Where x2 is the base register, and x3 is an offset register. +/// +/// When possible (or profitable) to fold a G_GEP into the address calculation, +/// this will do so. Otherwise, it will return None. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeRegisterOffset( + MachineOperand &Root) const { + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + // We need a GEP. + MachineInstr *Gep = MRI.getVRegDef(Root.getReg()); + if (!Gep || Gep->getOpcode() != TargetOpcode::G_GEP) + return None; + + // If this is used more than once, let's not bother folding. + // TODO: Check if they are memory ops. If they are, then we can still fold + // without having to recompute anything. + if (!MRI.hasOneUse(Gep->getOperand(0).getReg())) + return None; + + // Base is the GEP's LHS, offset is its RHS. + return {{[=](MachineInstrBuilder &MIB) { + MIB.addUse(Gep->getOperand(1).getReg()); + }, + [=](MachineInstrBuilder &MIB) { + MIB.addUse(Gep->getOperand(2).getReg()); + }, + [=](MachineInstrBuilder &MIB) { + // Need to add both immediates here to make sure that they are both + // added to the instruction. 
+ MIB.addImm(0); + MIB.addImm(0); + }}}; +} + +/// This is intended to be equivalent to selectAddrModeXRO in +/// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root, + unsigned SizeInBytes) const { + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + // If we have a constant offset, then we probably don't want to match a + // register offset. + if (isBaseWithConstantOffset(Root, MRI)) + return None; + + // Try to fold shifts into the addressing mode. + auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes); + if (AddrModeFns) + return AddrModeFns; + + // If that doesn't work, see if it's possible to fold in registers from + // a GEP. + return selectAddrModeRegisterOffset(Root); } /// Select a "register plus unscaled signed 9-bit immediate" address. This @@ -3994,6 +4432,205 @@ AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root, }}; } +/// Given a shift instruction, return the correct shift type for that +/// instruction. +static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) { + // TODO: Handle AArch64_AM::ROR + switch (MI.getOpcode()) { + default: + return AArch64_AM::InvalidShiftExtend; + case TargetOpcode::G_SHL: + return AArch64_AM::LSL; + case TargetOpcode::G_LSHR: + return AArch64_AM::LSR; + case TargetOpcode::G_ASHR: + return AArch64_AM::ASR; + } +} + +/// Select a "shifted register" operand. If the value is not shifted, set the +/// shift operand to a default value of "lsl 0". +/// +/// TODO: Allow shifted register to be rotated in logical instructions. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root) const { + if (!Root.isReg()) + return None; + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + // Check if the operand is defined by an instruction which corresponds to + // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc. + // + // TODO: Handle AArch64_AM::ROR for logical instructions. + MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg()); + if (!ShiftInst) + return None; + AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst); + if (ShType == AArch64_AM::InvalidShiftExtend) + return None; + if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI)) + return None; + + // Need an immediate on the RHS. + MachineOperand &ShiftRHS = ShiftInst->getOperand(2); + auto Immed = getImmedFromMO(ShiftRHS); + if (!Immed) + return None; + + // We have something that we can fold. Fold in the shift's LHS and RHS into + // the instruction. + MachineOperand &ShiftLHS = ShiftInst->getOperand(1); + Register ShiftReg = ShiftLHS.getReg(); + + unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits(); + unsigned Val = *Immed & (NumBits - 1); + unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val); + + return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}}; +} + +/// Get the correct ShiftExtendType for an extend instruction. +static AArch64_AM::ShiftExtendType +getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI) { + unsigned Opc = MI.getOpcode(); + + // Handle explicit extend instructions first. 
+ if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) { + unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + assert(Size != 64 && "Extend from 64 bits?"); + switch (Size) { + case 8: + return AArch64_AM::SXTB; + case 16: + return AArch64_AM::SXTH; + case 32: + return AArch64_AM::SXTW; + default: + return AArch64_AM::InvalidShiftExtend; + } + } + + if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) { + unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + assert(Size != 64 && "Extend from 64 bits?"); + switch (Size) { + case 8: + return AArch64_AM::UXTB; + case 16: + return AArch64_AM::UXTH; + case 32: + return AArch64_AM::UXTW; + default: + return AArch64_AM::InvalidShiftExtend; + } + } + + // Don't have an explicit extend. Try to handle a G_AND with a constant mask + // on the RHS. + if (Opc != TargetOpcode::G_AND) + return AArch64_AM::InvalidShiftExtend; + + Optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2)); + if (!MaybeAndMask) + return AArch64_AM::InvalidShiftExtend; + uint64_t AndMask = *MaybeAndMask; + switch (AndMask) { + default: + return AArch64_AM::InvalidShiftExtend; + case 0xFF: + return AArch64_AM::UXTB; + case 0xFFFF: + return AArch64_AM::UXTH; + case 0xFFFFFFFF: + return AArch64_AM::UXTW; + } +} + +Register AArch64InstructionSelector::narrowExtendRegIfNeeded( + Register ExtReg, MachineIRBuilder &MIB) const { + MachineRegisterInfo &MRI = *MIB.getMRI(); + if (MRI.getType(ExtReg).getSizeInBits() == 32) + return ExtReg; + + // Insert a copy to move ExtReg to GPR32. + Register NarrowReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + auto Copy = MIB.buildCopy({NarrowReg}, {ExtReg}); + + // Select the copy into a subregister copy. + selectCopy(*Copy, TII, MRI, TRI, RBI); + return Copy.getReg(0); +} + +/// Select an "extended register" operand. This operand folds in an extend +/// followed by an optional left shift. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectArithExtendedRegister( + MachineOperand &Root) const { + if (!Root.isReg()) + return None; + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + uint64_t ShiftVal = 0; + Register ExtReg; + AArch64_AM::ShiftExtendType Ext; + MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI); + if (!RootDef) + return None; + + if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI)) + return None; + + // Check if we can fold a shift and an extend. + if (RootDef->getOpcode() == TargetOpcode::G_SHL) { + // Look for a constant on the RHS of the shift. + MachineOperand &RHS = RootDef->getOperand(2); + Optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS); + if (!MaybeShiftVal) + return None; + ShiftVal = *MaybeShiftVal; + if (ShiftVal > 4) + return None; + // Look for a valid extend instruction on the LHS of the shift. + MachineOperand &LHS = RootDef->getOperand(1); + MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI); + if (!ExtDef) + return None; + Ext = getExtendTypeForInst(*ExtDef, MRI); + if (Ext == AArch64_AM::InvalidShiftExtend) + return None; + ExtReg = ExtDef->getOperand(1).getReg(); + } else { + // Didn't get a shift. Try just folding an extend. + Ext = getExtendTypeForInst(*RootDef, MRI); + if (Ext == AArch64_AM::InvalidShiftExtend) + return None; + ExtReg = RootDef->getOperand(1).getReg(); + + // If we have a 32 bit instruction which zeroes out the high half of a + // register, we get an implicit zero extend for free. Check if we have one. 
+ // FIXME: We actually emit the extend right now even though we don't have + // to. + if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) { + MachineInstr *ExtInst = MRI.getVRegDef(ExtReg); + if (ExtInst && isDef32(*ExtInst)) + return None; + } + } + + // We require a GPR32 here. Narrow the ExtReg if needed using a subregister + // copy. + MachineIRBuilder MIB(*RootDef); + ExtReg = narrowExtendRegIfNeeded(ExtReg, MIB); + + return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }, + [=](MachineInstrBuilder &MIB) { + MIB.addImm(getArithExtendImm(Ext, ShiftVal)); + }}}; +} + void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); @@ -4003,6 +4640,51 @@ void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB, MIB.addImm(CstVal.getValue()); } +void AArch64InstructionSelector::renderLogicalImm32( + MachineInstrBuilder &MIB, const MachineInstr &I) const { + assert(I.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); + uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue(); + uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32); + MIB.addImm(Enc); +} + +void AArch64InstructionSelector::renderLogicalImm64( + MachineInstrBuilder &MIB, const MachineInstr &I) const { + assert(I.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); + uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue(); + uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64); + MIB.addImm(Enc); +} + +bool AArch64InstructionSelector::isLoadStoreOfNumBytes( + const MachineInstr &MI, unsigned NumBytes) const { + if (!MI.mayLoadOrStore()) + return false; + assert(MI.hasOneMemOperand() && + "Expected load/store to have only one mem op!"); + return (*MI.memoperands_begin())->getSize() == NumBytes; +} + +bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32) + return false; + + // Only return true if we know the operation will zero-out the high half of + // the 64-bit register. Truncates can be subregister copies, which don't + // zero out the high bits. Copies and other copy-like instructions can be + // fed by truncates, or could be lowered as subregister copies. 
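// [Aside, not part of the patch; sketched generic MIR] Why G_TRUNC must not
// count as a 32-bit def: a truncate is typically selected as a plain
// subregister copy,
//   %t:gpr32 = COPY %big.sub_32
// which leaves bits 32-63 of the underlying X register untouched rather than
// clearing them. Counting %t as "defined by a 32-bit op" would let the
// extended-register folding above drop a UXTW that is still needed, so
// G_TRUNC and the copy-like opcodes that may be fed by one conservatively
// return false below.
// [End aside]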
+ switch (MI.getOpcode()) { + default: + return true; + case TargetOpcode::COPY: + case TargetOpcode::G_BITCAST: + case TargetOpcode::G_TRUNC: + case TargetOpcode::G_PHI: + return false; + } +} + namespace llvm { InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &TM, diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp index a985b330eafa..7a1901bd5b1e 100644 --- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -13,7 +13,9 @@ #include "AArch64LegalizerInfo.h" #include "AArch64Subtarget.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" @@ -50,6 +52,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { const LLT v2s64 = LLT::vector(2, 64); const LLT v2p0 = LLT::vector(2, p0); + // FIXME: support subtargets which have neon/fp-armv8 disabled. + if (!ST.hasNEON() || !ST.hasFPARMv8()) { + computeTables(); + return; + } + getActionDefinitionsBuilder(G_IMPLICIT_DEF) .legalFor({p0, s1, s8, s16, s32, s64, v4s32, v2s64}) .clampScalar(0, s1, s64) @@ -74,7 +82,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { getActionDefinitionsBuilder(G_BSWAP) .legalFor({s32, s64, v4s32, v2s32, v2s64}) - .clampScalar(0, s16, s64) + .clampScalar(0, s32, s64) .widenScalarToNextPow2(0); getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) @@ -104,6 +112,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { getActionDefinitionsBuilder({G_SDIV, G_UDIV}) .legalFor({s32, s64}) + .libcallFor({s128}) .clampScalar(0, s32, s64) .widenScalarToNextPow2(0) .scalarize(0); @@ -115,8 +124,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && AmtTy.getSizeInBits() == 32; }) - .legalFor( - {{s32, s32}, {s32, s64}, {s64, s64}, {v2s32, v2s32}, {v4s32, v4s32}}) + .legalFor({{s32, s32}, + {s32, s64}, + {s64, s64}, + {v2s32, v2s32}, + {v4s32, v4s32}, + {v2s64, v2s64}}) .clampScalar(1, s32, s64) .clampScalar(0, s32, s64) .minScalarSameAs(1, 0); @@ -191,14 +204,14 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { .legalIf([=](const LegalityQuery &Query) { const LLT &Ty0 = Query.Types[0]; const LLT &Ty1 = Query.Types[1]; - if (Ty1 != s32 && Ty1 != s64) + if (Ty1 != s32 && Ty1 != s64 && Ty1 != s128) return false; if (Ty1 == p0) return true; return isPowerOf2_32(Ty0.getSizeInBits()) && (Ty0.getSizeInBits() == 1 || Ty0.getSizeInBits() >= 8); }) - .clampScalar(1, s32, s64) + .clampScalar(1, s32, s128) .widenScalarToNextPow2(1) .maxScalarIf(typeInSet(1, {s32}), 0, s16) .maxScalarIf(typeInSet(1, {s64}), 0, s32) @@ -236,6 +249,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { {s32, p0, 32, 8}, {s64, p0, 64, 8}, {p0, p0, 64, 8}, + {s128, p0, 128, 8}, {v8s8, p0, 64, 8}, {v16s8, p0, 128, 8}, {v4s16, p0, 64, 8}, @@ -247,14 +261,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { .legalForTypesWithMemDesc({{s32, p0, 8, 8}, {s32, p0, 16, 8}}) .clampScalar(0, s8, s64) - .widenScalarToNextPow2(0) - // TODO: We could support sum-of-pow2's but the lowering code doesn't know - // how to do that yet. 
- .unsupportedIfMemSizeNotPow2() + .lowerIfMemSizeNotPow2() // Lower any any-extending loads left into G_ANYEXT and G_LOAD .lowerIf([=](const LegalityQuery &Query) { return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits; }) + .widenScalarToNextPow2(0) .clampMaxNumElements(0, s32, 2) .clampMaxNumElements(0, s64, 1) .customIf(IsPtrVecPred); @@ -262,9 +274,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { getActionDefinitionsBuilder(G_STORE) .legalForTypesWithMemDesc({{s8, p0, 8, 8}, {s16, p0, 16, 8}, + {s32, p0, 8, 8}, + {s32, p0, 16, 8}, {s32, p0, 32, 8}, {s64, p0, 64, 8}, {p0, p0, 64, 8}, + {s128, p0, 128, 8}, {v16s8, p0, 128, 8}, {v4s16, p0, 64, 8}, {v8s16, p0, 128, 8}, @@ -272,10 +287,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { {v4s32, p0, 128, 8}, {v2s64, p0, 128, 8}}) .clampScalar(0, s8, s64) - .widenScalarToNextPow2(0) - // TODO: We could support sum-of-pow2's but the lowering code doesn't know - // how to do that yet. - .unsupportedIfMemSizeNotPow2() + .lowerIfMemSizeNotPow2() .lowerIf([=](const LegalityQuery &Query) { return Query.Types[0].isScalar() && Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits; @@ -305,8 +317,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { {v8s16, v8s16}, {v8s8, v8s8}, {v16s8, v16s8}}) - .clampScalar(0, s32, s32) .clampScalar(1, s32, s64) + .clampScalar(0, s32, s32) .minScalarEltSameAsIf( [=](const LegalityQuery &Query) { const LLT &Ty = Query.Types[0]; @@ -330,33 +342,40 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { .widenScalarToNextPow2(1); // Extensions - getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT}) - .legalIf([=](const LegalityQuery &Query) { - unsigned DstSize = Query.Types[0].getSizeInBits(); + auto ExtLegalFunc = [=](const LegalityQuery &Query) { + unsigned DstSize = Query.Types[0].getSizeInBits(); - // Make sure that we have something that will fit in a register, and - // make sure it's a power of 2. - if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize)) - return false; + if (DstSize == 128 && !Query.Types[0].isVector()) + return false; // Extending to a scalar s128 needs narrowing. + + // Make sure that we have something that will fit in a register, and + // make sure it's a power of 2. + if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize)) + return false; - const LLT &SrcTy = Query.Types[1]; + const LLT &SrcTy = Query.Types[1]; - // Special case for s1. - if (SrcTy == s1) - return true; + // Special case for s1. + if (SrcTy == s1) + return true; - // Make sure we fit in a register otherwise. Don't bother checking that - // the source type is below 128 bits. We shouldn't be allowing anything - // through which is wider than the destination in the first place. - unsigned SrcSize = SrcTy.getSizeInBits(); - if (SrcSize < 8 || !isPowerOf2_32(SrcSize)) - return false; + // Make sure we fit in a register otherwise. Don't bother checking that + // the source type is below 128 bits. We shouldn't be allowing anything + // through which is wider than the destination in the first place. + unsigned SrcSize = SrcTy.getSizeInBits(); + if (SrcSize < 8 || !isPowerOf2_32(SrcSize)) + return false; - return true; - }); + return true; + }; + getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT}) + .legalIf(ExtLegalFunc) + .clampScalar(0, s64, s64); // Just for s128, others are handled above. 
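// [Aside, not part of the patch] The shape of ExtLegalFunc above as a
// standalone predicate over bit widths; helper names are invented and this is
// only a sketch of the scalar rules (s1 special case included).
static bool isPow2(unsigned N) { return N && !(N & (N - 1)); }
static bool extIsLegal(unsigned DstBits, bool DstIsVector, unsigned SrcBits) {
  if (DstBits == 128 && !DstIsVector)
    return false;                         // scalar s128 results need narrowing
  if (DstBits < 8 || DstBits > 128 || !isPow2(DstBits))
    return false;                         // must fit a register, power of two
  if (SrcBits == 1)
    return true;                          // s1 is special-cased
  return SrcBits >= 8 && isPow2(SrcBits); // source must be register-sized too
}
// extIsLegal(64, false, 32) -> true; extIsLegal(128, false, 64) -> false,
// and the trailing clampScalar(0, s64, s64) is what narrows that s128 case.
// [End aside]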
getActionDefinitionsBuilder(G_TRUNC).alwaysLegal(); + getActionDefinitionsBuilder(G_SEXT_INREG).lower(); + // FP conversions getActionDefinitionsBuilder(G_FPTRUNC).legalFor( {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}}); @@ -591,6 +610,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { return Query.Types[0] == p0 && Query.Types[1] == s64; }); + getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower(); + computeTables(); verify(*ST.getInstrInfo()); } @@ -617,6 +638,24 @@ bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI, llvm_unreachable("expected switch to return"); } +bool AArch64LegalizerInfo::legalizeIntrinsic( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + switch (MI.getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memset: + case Intrinsic::memmove: + if (createMemLibcall(MIRBuilder, MRI, MI) == + LegalizerHelper::UnableToLegalize) + return false; + MI.eraseFromParent(); + return true; + default: + break; + } + return true; +} + bool AArch64LegalizerInfo::legalizeShlAshrLshr( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, GISelChangeObserver &Observer) const { @@ -655,7 +694,7 @@ bool AArch64LegalizerInfo::legalizeLoadStore( // legalized. In order to allow further legalization of the inst, we create // a new instruction and erase the existing one. - unsigned ValReg = MI.getOperand(0).getReg(); + Register ValReg = MI.getOperand(0).getReg(); const LLT ValTy = MRI.getType(ValReg); if (!ValTy.isVector() || !ValTy.getElementType().isPointer() || @@ -672,7 +711,7 @@ bool AArch64LegalizerInfo::legalizeLoadStore( auto Bitcast = MIRBuilder.buildBitcast({NewTy}, {ValReg}); MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1).getReg(), MMO); } else { - unsigned NewReg = MRI.createGenericVirtualRegister(NewTy); + Register NewReg = MRI.createGenericVirtualRegister(NewTy); auto NewLoad = MIRBuilder.buildLoad(NewReg, MI.getOperand(1).getReg(), MMO); MIRBuilder.buildBitcast({ValReg}, {NewLoad}); } diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.h b/lib/Target/AArch64/AArch64LegalizerInfo.h index f3362a18620f..15161bab466c 100644 --- a/lib/Target/AArch64/AArch64LegalizerInfo.h +++ b/lib/Target/AArch64/AArch64LegalizerInfo.h @@ -31,6 +31,9 @@ public: MachineIRBuilder &MIRBuilder, GISelChangeObserver &Observer) const override; + bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const override; + private: bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder) const; diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 65b5f906e3f6..a0c4a25bb5b9 100644 --- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -201,8 +201,22 @@ static bool isNarrowStore(unsigned Opc) { } } +// These instructions set the memory tag and either keep the memory contents +// unchanged or set them to zero, ignoring the address part of the source +// register. +static bool isTagStore(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return false; + case AArch64::STGOffset: + case AArch64::STZGOffset: + case AArch64::ST2GOffset: + case AArch64::STZ2GOffset: + return true; + } +} + // Scaling factor for unscaled load or store.
-static int getMemScale(MachineInstr &MI) { +static int getMemScale(const MachineInstr &MI) { switch (MI.getOpcode()) { default: llvm_unreachable("Opcode has unknown scale!"); @@ -255,6 +269,11 @@ static int getMemScale(MachineInstr &MI) { case AArch64::STURQi: case AArch64::LDPQi: case AArch64::STPQi: + case AArch64::STGOffset: + case AArch64::STZGOffset: + case AArch64::ST2GOffset: + case AArch64::STZ2GOffset: + case AArch64::STGPi: return 16; } } @@ -449,6 +468,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { return AArch64::STPWpre; case AArch64::STPXi: return AArch64::STPXpre; + case AArch64::STGOffset: + return AArch64::STGPreIndex; + case AArch64::STZGOffset: + return AArch64::STZGPreIndex; + case AArch64::ST2GOffset: + return AArch64::ST2GPreIndex; + case AArch64::STZ2GOffset: + return AArch64::STZ2GPreIndex; + case AArch64::STGPi: + return AArch64::STGPpre; } } @@ -518,6 +547,16 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { return AArch64::STPWpost; case AArch64::STPXi: return AArch64::STPXpost; + case AArch64::STGOffset: + return AArch64::STGPostIndex; + case AArch64::STZGOffset: + return AArch64::STZGPostIndex; + case AArch64::ST2GOffset: + return AArch64::ST2GPostIndex; + case AArch64::STZ2GOffset: + return AArch64::STZ2GPostIndex; + case AArch64::STGPi: + return AArch64::STGPpost; } } @@ -536,10 +575,30 @@ static bool isPairedLdSt(const MachineInstr &MI) { case AArch64::STPQi: case AArch64::STPWi: case AArch64::STPXi: + case AArch64::STGPi: return true; } } +// Returns the scale and offset range of pre/post indexed variants of MI. +static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale, + int &MinOffset, int &MaxOffset) { + bool IsPaired = isPairedLdSt(MI); + bool IsTagStore = isTagStore(MI); + // ST*G and all paired ldst have the same scale in pre/post-indexed variants + // as in the "unsigned offset" variant. + // All other pre/post indexed ldst instructions are unscaled. + Scale = (IsTagStore || IsPaired) ? getMemScale(MI) : 1; + + if (IsPaired) { + MinOffset = -64; + MaxOffset = 63; + } else { + MinOffset = -256; + MaxOffset = 255; + } +} + static const MachineOperand &getLdStRegOp(const MachineInstr &MI, unsigned PairedRegOp = 0) { assert(PairedRegOp < 2 && "Unexpected register operand idx."); @@ -618,6 +677,11 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) { case AArch64::LDRWui: case AArch64::LDRHHui: case AArch64::LDRBBui: + case AArch64::STGOffset: + case AArch64::STZGOffset: + case AArch64::ST2GOffset: + case AArch64::STZ2GOffset: + case AArch64::STGPi: // Unscaled instructions. case AArch64::STURSi: case AArch64::STURDi: @@ -808,7 +872,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // STRWui %w1, ... // USE kill %w1 ; need to clear kill flag when moving STRWui downwards // STRW %w0 - unsigned Reg = getLdStRegOp(*I).getReg(); + Register Reg = getLdStRegOp(*I).getReg(); for (MachineInstr &MI : make_range(std::next(I), Paired)) MI.clearRegisterKills(Reg, TRI); } @@ -837,9 +901,9 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MachineOperand &DstMO = MIB->getOperand(SExtIdx); // Right now, DstMO has the extended register, since it comes from an // extended opcode. - unsigned DstRegX = DstMO.getReg(); + Register DstRegX = DstMO.getReg(); // Get the W variant of that register. - unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32); + Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32); // Update the result of LDP to use the W instead of the X variant. 
DstMO.setReg(DstRegW); LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); @@ -882,9 +946,9 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, int LoadSize = getMemScale(*LoadI); int StoreSize = getMemScale(*StoreI); - unsigned LdRt = getLdStRegOp(*LoadI).getReg(); + Register LdRt = getLdStRegOp(*LoadI).getReg(); const MachineOperand &StMO = getLdStRegOp(*StoreI); - unsigned StRt = getLdStRegOp(*StoreI).getReg(); + Register StRt = getLdStRegOp(*StoreI).getReg(); bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); assert((IsStoreXReg || @@ -933,10 +997,10 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, ? getLdStOffsetOp(*StoreI).getImm() : getLdStOffsetOp(*StoreI).getImm() * StoreSize; int Width = LoadSize * 8; - unsigned DestReg = IsStoreXReg - ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32, - &AArch64::GPR64RegClass) - : LdRt; + unsigned DestReg = + IsStoreXReg ? Register(TRI->getMatchingSuperReg( + LdRt, AArch64::sub_32, &AArch64::GPR64RegClass)) + : LdRt; assert((UnscaledLdOffset >= UnscaledStOffset && (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) && @@ -1042,7 +1106,7 @@ bool AArch64LoadStoreOpt::findMatchingStore( MachineBasicBlock::iterator B = I->getParent()->begin(); MachineBasicBlock::iterator MBBI = I; MachineInstr &LoadMI = *I; - unsigned BaseReg = getLdStBaseOp(LoadMI).getReg(); + Register BaseReg = getLdStBaseOp(LoadMI).getReg(); // If the load is the first instruction in the block, there's obviously // not any matching store. @@ -1156,8 +1220,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, bool MayLoad = FirstMI.mayLoad(); bool IsUnscaled = TII->isUnscaledLdSt(FirstMI); - unsigned Reg = getLdStRegOp(FirstMI).getReg(); - unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); + Register Reg = getLdStRegOp(FirstMI).getReg(); + Register BaseReg = getLdStBaseOp(FirstMI).getReg(); int Offset = getLdStOffsetOp(FirstMI).getImm(); int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1; bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); @@ -1188,7 +1252,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // check for +1/-1. Make sure to check the new instruction offset is // actually an immediate and not a symbolic reference destined for // a relocation. - unsigned MIBaseReg = getLdStBaseOp(MI).getReg(); + Register MIBaseReg = getLdStBaseOp(MI).getReg(); int MIOffset = getLdStOffsetOp(MI).getImm(); bool MIIsUnscaled = TII->isUnscaledLdSt(MI); if (IsUnscaled != MIIsUnscaled) { @@ -1328,18 +1392,19 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode()) : getPostIndexedOpcode(I->getOpcode()); MachineInstrBuilder MIB; + int Scale, MinOffset, MaxOffset; + getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset); if (!isPairedLdSt(*I)) { // Non-paired instruction. MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) .add(getLdStRegOp(*Update)) .add(getLdStRegOp(*I)) .add(getLdStBaseOp(*I)) - .addImm(Value) + .addImm(Value / Scale) .setMemRefs(I->memoperands()) .setMIFlags(I->mergeFlagsWith(*Update)); } else { // Paired instruction. 
- int Scale = getMemScale(*I); MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) .add(getLdStRegOp(*Update)) .add(getLdStRegOp(*I, 0)) @@ -1395,28 +1460,21 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, MI.getOperand(1).getReg() != BaseReg) break; - bool IsPairedInsn = isPairedLdSt(MemMI); int UpdateOffset = MI.getOperand(2).getImm(); if (MI.getOpcode() == AArch64::SUBXri) UpdateOffset = -UpdateOffset; - // For non-paired load/store instructions, the immediate must fit in a - // signed 9-bit integer. - if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256)) + // The immediate must be a multiple of the scaling factor of the pre/post + // indexed instruction. + int Scale, MinOffset, MaxOffset; + getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset); + if (UpdateOffset % Scale != 0) break; - // For paired load/store instructions, the immediate must be a multiple of - // the scaling factor. The scaled offset must also fit into a signed 7-bit - // integer. - if (IsPairedInsn) { - int Scale = getMemScale(MemMI); - if (UpdateOffset % Scale != 0) - break; - - int ScaledOffset = UpdateOffset / Scale; - if (ScaledOffset > 63 || ScaledOffset < -64) - break; - } + // Scaled offset must fit in the instruction immediate. + int ScaledOffset = UpdateOffset / Scale; + if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset) + break; // If we have a non-zero Offset, we check that it matches the amount // we're adding to the register. @@ -1433,7 +1491,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( MachineInstr &MemMI = *I; MachineBasicBlock::iterator MBBI = I; - unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); + Register BaseReg = getLdStBaseOp(MemMI).getReg(); int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI); // Scan forward looking for post-index opportunities. Updating instructions @@ -1442,13 +1500,19 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( if (MIUnscaledOffset != UnscaledOffset) return E; - // If the base register overlaps a destination register, we can't - // merge the update. - bool IsPairedInsn = isPairedLdSt(MemMI); - for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { - unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); - if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) - return E; + // If the base register overlaps a source/destination register, we can't + // merge the update. This does not apply to tag store instructions, which + // ignore the address part of the source register. + // Nor does it apply to STGPi, which, unlike normal stores, has no + // unpredictable behavior in this case and always performs writeback + // after reading the source register value. + if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) { + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ?
2 : 1; i != e; ++i) { + Register DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } } // Track which register units have been modified and used between the first @@ -1487,7 +1551,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( MachineInstr &MemMI = *I; MachineBasicBlock::iterator MBBI = I; - unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); + Register BaseReg = getLdStBaseOp(MemMI).getReg(); int Offset = getLdStOffsetOp(MemMI).getImm(); // If the load/store is the first instruction in the block, there's obviously @@ -1496,11 +1560,13 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( return E; // If the base register overlaps a destination register, we can't // merge the update. - bool IsPairedInsn = isPairedLdSt(MemMI); - for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { - unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); - if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) - return E; + if (!isTagStore(MemMI)) { + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { + Register DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } } // Track which register units have been modified and used between the first @@ -1659,7 +1725,7 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate // however, is not, so adjust here. int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI); - // Look forward to try to find a post-index instruction. For example, + // Look forward to try to find a pre-index instruction. For example, // ldr x1, [x0, #64] // add x0, x0, #64 // merged into: diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp index e7d4a2789a28..afd5ae6bcbf2 100644 --- a/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -148,6 +148,8 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, RefFlags |= AArch64MCExpr::VK_TLSDESC; break; } + } else if (MO.getTargetFlags() & AArch64II::MO_PREL) { + RefFlags |= AArch64MCExpr::VK_PREL; } else { // No modifier means this is a generic reference, classified as absolute for // the cases where it matters (:abs_g0: etc). diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 0efeeb272ec1..0009fb7b5520 100644 --- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/Function.h" #include "llvm/MC/MCLinkerOptimizationHint.h" #include <cassert> @@ -95,6 +96,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// returned struct in a register. This field holds the virtual register into /// which the sret argument is passed. unsigned SRetReturnReg = 0; + /// The SVE stack size (for predicates and data vectors) is maintained here + /// rather than in FrameInfo, as the placement and Stack IDs are target + /// specific. + uint64_t StackSizeSVE = 0; + + /// HasCalculatedStackSizeSVE indicates whether StackSizeSVE is valid. + bool HasCalculatedStackSizeSVE = false; /// Has a value when it is known whether or not the function uses a /// redzone, and no value otherwise.
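// [Aside, not part of the patch] The writeback-merge legality rule from the
// AArch64LoadStoreOptimizer hunks above, restated as a self-contained check;
// the helper name is invented, and getPrePostIndexedMemOpInfo supplies Scale
// and the [MinOff, MaxOff] range per instruction.
#include <cstdint>
static bool updateOffsetFits(int64_t UpdateOff, int Scale, int MinOff,
                             int MaxOff) {
  if (UpdateOff % Scale != 0)
    return false;                  // must be a multiple of the access scale
  int64_t Scaled = UpdateOff / Scale;
  return Scaled >= MinOff && Scaled <= MaxOff;
}
// Paired ldp/stp of X registers: Scale = 8, range [-64, 63], so
//   updateOffsetFits(504, 8, -64, 63) -> true  (ldp x0, x1, [x2], #504)
//   updateOffsetFits(512, 8, -64, 63) -> false (scaled 64 overflows imm7)
// Tag stores (STG and friends) keep Scale = 16 with the unpaired [-256, 255].
// [End aside]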
@@ -131,6 +139,15 @@ public: ArgumentStackToRestore = bytes; } + bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; } + + void setStackSizeSVE(uint64_t S) { + HasCalculatedStackSizeSVE = true; + StackSizeSVE = S; + } + + uint64_t getStackSizeSVE() const { return StackSizeSVE; } + bool hasStackFrame() const { return HasStackFrame; } void setHasStackFrame(bool s) { HasStackFrame = s; } diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp index aff861aae6be..d503c39b1f90 100644 --- a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp +++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp @@ -162,11 +162,11 @@ bool A57ChainingConstraint::addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd, LiveIntervals &LIs = G.getMetadata().LIS; - if (TRI->isPhysicalRegister(Rd) || TRI->isPhysicalRegister(Ra)) { - LLVM_DEBUG(dbgs() << "Rd is a physical reg:" << TRI->isPhysicalRegister(Rd) - << '\n'); - LLVM_DEBUG(dbgs() << "Ra is a physical reg:" << TRI->isPhysicalRegister(Ra) - << '\n'); + if (Register::isPhysicalRegister(Rd) || Register::isPhysicalRegister(Ra)) { + LLVM_DEBUG(dbgs() << "Rd is a physical reg:" + << Register::isPhysicalRegister(Rd) << '\n'); + LLVM_DEBUG(dbgs() << "Ra is a physical reg:" + << Register::isPhysicalRegister(Ra) << '\n'); return false; } @@ -359,8 +359,8 @@ void A57ChainingConstraint::apply(PBQPRAGraph &G) { case AArch64::FMADDDrrr: case AArch64::FNMSUBDrrr: case AArch64::FNMADDDrrr: { - unsigned Rd = MI.getOperand(0).getReg(); - unsigned Ra = MI.getOperand(3).getReg(); + Register Rd = MI.getOperand(0).getReg(); + Register Ra = MI.getOperand(3).getReg(); if (addIntraChainConstraint(G, Rd, Ra)) addInterChainConstraint(G, Rd, Ra); @@ -369,7 +369,7 @@ void A57ChainingConstraint::apply(PBQPRAGraph &G) { case AArch64::FMLAv2f32: case AArch64::FMLSv2f32: { - unsigned Rd = MI.getOperand(0).getReg(); + Register Rd = MI.getOperand(0).getReg(); addInterChainConstraint(G, Rd, Rd); break; } diff --git a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp index 5f7245bfbd74..d30ea120bae4 100644 --- a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp +++ b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp @@ -15,7 +15,9 @@ #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/Support/Debug.h" @@ -25,12 +27,31 @@ using namespace llvm; using namespace MIPatternMatch; +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS +#include "AArch64GenGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS + namespace { +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H +#include "AArch64GenGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H + class AArch64PreLegalizerCombinerInfo : public CombinerInfo { + GISelKnownBits *KB; + MachineDominatorTree *MDT; + public: - AArch64PreLegalizerCombinerInfo() + AArch64GenPreLegalizerCombinerHelper Generated; + + AArch64PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, + GISelKnownBits *KB, MachineDominatorTree *MDT) : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, - /*LegalizerInfo*/ nullptr) {} + 
/*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize), + KB(KB), MDT(MDT) { + if (!Generated.parseCommandLineOption()) + report_fatal_error("Invalid rule identifier"); + } + virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const override; }; @@ -38,24 +59,50 @@ public: bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const { - CombinerHelper Helper(Observer, B); + CombinerHelper Helper(Observer, B, KB, MDT); switch (MI.getOpcode()) { - default: - return false; - case TargetOpcode::COPY: - return Helper.tryCombineCopy(MI); - case TargetOpcode::G_BR: - return Helper.tryCombineBr(MI); + case TargetOpcode::G_CONCAT_VECTORS: + return Helper.tryCombineConcatVectors(MI); + case TargetOpcode::G_SHUFFLE_VECTOR: + return Helper.tryCombineShuffleVector(MI); case TargetOpcode::G_LOAD: case TargetOpcode::G_SEXTLOAD: - case TargetOpcode::G_ZEXTLOAD: - return Helper.tryCombineExtendingLoads(MI); + case TargetOpcode::G_ZEXTLOAD: { + bool Changed = false; + Changed |= Helper.tryCombineExtendingLoads(MI); + Changed |= Helper.tryCombineIndexedLoadStore(MI); + return Changed; + } + case TargetOpcode::G_STORE: + return Helper.tryCombineIndexedLoadStore(MI); + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + switch (MI.getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memmove: + case Intrinsic::memset: { + // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other + // heuristics decide. + unsigned MaxLen = EnableOpt ? 0 : 32; + // Try to inline memcpy type calls if optimizations are enabled. + return (!EnableMinSize) ? Helper.tryCombineMemCpyFamily(MI, MaxLen) + : false; + } + default: + break; + } } + if (Generated.tryCombineAll(Observer, MI, B)) + return true; + return false; } +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP +#include "AArch64GenGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP + // Pass boilerplate // ================ @@ -63,24 +110,33 @@ class AArch64PreLegalizerCombiner : public MachineFunctionPass { public: static char ID; - AArch64PreLegalizerCombiner(); + AArch64PreLegalizerCombiner(bool IsOptNone = false); StringRef getPassName() const override { return "AArch64PreLegalizerCombiner"; } bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override; +private: + bool IsOptNone; }; -} +} // end anonymous namespace void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetPassConfig>(); AU.setPreservesCFG(); getSelectionDAGFallbackAnalysisUsage(AU); + AU.addRequired<GISelKnownBitsAnalysis>(); + AU.addPreserved<GISelKnownBitsAnalysis>(); + if (!IsOptNone) { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + } MachineFunctionPass::getAnalysisUsage(AU); } -AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner() : MachineFunctionPass(ID) { +AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner(bool IsOptNone) + : MachineFunctionPass(ID), IsOptNone(IsOptNone) { initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); } @@ -89,7 +145,14 @@ bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { MachineFunctionProperties::Property::FailedISel)) return false; auto *TPC = &getAnalysis<TargetPassConfig>(); - AArch64PreLegalizerCombinerInfo PCInfo; + const Function &F = MF.getFunction(); + bool EnableOpt = + MF.getTarget().getOptLevel() != 
CodeGenOpt::None && !skipFunction(F); + GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); + MachineDominatorTree *MDT = + IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); + AArch64PreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), + F.hasMinSize(), KB, MDT); Combiner C(PCInfo, TPC); return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); } @@ -99,13 +162,14 @@ INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE, "Combine AArch64 machine instrs before legalization", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE, "Combine AArch64 machine instrs before legalization", false, false) namespace llvm { -FunctionPass *createAArch64PreLegalizeCombiner() { - return new AArch64PreLegalizerCombiner(); +FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone) { + return new AArch64PreLegalizerCombiner(IsOptNone); } } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index b52259cc9acd..8ec73aa3c040 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -563,12 +563,12 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { return getSameKindOfOperandsMapping(MI); } case TargetOpcode::COPY: { - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); // Check if one of the register is not a generic register. - if ((TargetRegisterInfo::isPhysicalRegister(DstReg) || + if ((Register::isPhysicalRegister(DstReg) || !MRI.getType(DstReg).isValid()) || - (TargetRegisterInfo::isPhysicalRegister(SrcReg) || + (Register::isPhysicalRegister(SrcReg) || !MRI.getType(SrcReg).isValid())) { const RegisterBank *DstRB = getRegBank(DstReg, MRI, TRI); const RegisterBank *SrcRB = getRegBank(SrcReg, MRI, TRI); @@ -635,6 +635,12 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // Some of the floating-point instructions have mixed GPR and FPR operands: // fine-tune the computed mapping. switch (Opc) { + case TargetOpcode::G_TRUNC: { + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR}; + break; + } case TargetOpcode::G_SITOFP: case TargetOpcode::G_UITOFP: if (MRI.getType(MI.getOperand(0).getReg()).isVector()) @@ -687,7 +693,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case TargetOpcode::G_STORE: // Check if that store is fed by fp instructions. if (OpRegBankIdx[0] == PMI_FirstGPR) { - unsigned VReg = MI.getOperand(0).getReg(); + Register VReg = MI.getOperand(0).getReg(); if (!VReg) break; MachineInstr *DefMI = MRI.getVRegDef(VReg); @@ -702,11 +708,10 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; // If we're taking in vectors, we have no choice but to put everything on - // FPRs. + // FPRs, except for the condition. The condition must always be on a GPR. 
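// (G_SELECT's operands are: result, condition, true value, false value, so
// the mapping chosen below keeps index 1, the condition, on GPR and moves
// the result and both inputs onto FPR.)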
LLT SrcTy = MRI.getType(MI.getOperand(2).getReg()); if (SrcTy.isVector()) { - for (unsigned Idx = 0; Idx < 4; ++Idx) - OpRegBankIdx[Idx] = PMI_FirstFPR; + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR, PMI_FirstFPR, PMI_FirstFPR}; break; } @@ -740,7 +745,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // This doesn't check the condition, since it's just whatever is in NZCV. // This isn't passed explicitly in a register to fcsel/csel. for (unsigned Idx = 2; Idx < 4; ++Idx) { - unsigned VReg = MI.getOperand(Idx).getReg(); + Register VReg = MI.getOperand(Idx).getReg(); MachineInstr *DefMI = MRI.getVRegDef(VReg); if (getRegBank(VReg, MRI, TRI) == &AArch64::FPRRegBank || onlyDefinesFP(*DefMI, MRI, TRI)) @@ -750,8 +755,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // If we have more FP constraints than not, then move everything over to // FPR. if (NumFP >= 2) - for (unsigned Idx = 0; Idx < 4; ++Idx) - OpRegBankIdx[Idx] = PMI_FirstFPR; + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR, PMI_FirstFPR, PMI_FirstFPR}; break; } @@ -764,7 +768,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { LLT SrcTy = MRI.getType(MI.getOperand(MI.getNumOperands()-1).getReg()); // UNMERGE into scalars from a vector should always use FPR. // Likewise if any of the uses are FP instructions. - if (SrcTy.isVector() || + if (SrcTy.isVector() || SrcTy == LLT::scalar(128) || any_of(MRI.use_instructions(MI.getOperand(0).getReg()), [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); })) { // Set the register bank of every operand to FPR. @@ -795,12 +799,21 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // Index needs to be a GPR. OpRegBankIdx[3] = PMI_FirstGPR; break; + case TargetOpcode::G_EXTRACT: { + // For s128 sources we have to use fpr. + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + if (SrcTy.getSizeInBits() == 128) { + OpRegBankIdx[0] = PMI_FirstFPR; + OpRegBankIdx[1] = PMI_FirstFPR; + } + break; + } case TargetOpcode::G_BUILD_VECTOR: // If the first source operand belongs to a FPR register bank, then make // sure that we preserve that. 
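// (Likely rationale: a build_vector whose scalar elements were produced on
// FPR, e.g. by unmerging another vector, is cheaper to reassemble entirely
// on FPR, which is the case the check below looks for.)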
if (OpRegBankIdx[1] != PMI_FirstGPR) break; - unsigned VReg = MI.getOperand(1).getReg(); + Register VReg = MI.getOperand(1).getReg(); if (!VReg) break; diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 6d5a4e3d2f76..de176088595d 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -15,6 +15,7 @@ #include "AArch64FrameLowering.h" #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" +#include "AArch64StackOffset.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" @@ -23,10 +24,10 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Function.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; @@ -63,8 +64,9 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_AArch64_AAPCS_SwiftError_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) return CSR_AArch64_RT_MostRegs_SaveList; - else - return CSR_AArch64_AAPCS_SaveList; + if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin()) + return CSR_Darwin_AArch64_AAPCS_SaveList; + return CSR_AArch64_AAPCS_SaveList; } const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy( @@ -120,6 +122,8 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, : CSR_AArch64_CXX_TLS_Darwin_RegMask; if (CC == CallingConv::AArch64_VectorCall) return SCS ? CSR_AArch64_AAVPCS_SCS_RegMask : CSR_AArch64_AAVPCS_RegMask; + if (CC == CallingConv::AArch64_SVE_VectorCall) + return CSR_AArch64_SVE_AAPCS_RegMask; if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering() ->supportSwiftError() && MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError)) @@ -388,7 +392,7 @@ bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, int64_t Offset) const { assert(Offset <= INT_MAX && "Offset too big to fit in int."); assert(MI && "Unable to get the legal offset for nil instruction."); - int SaveOffset = Offset; + StackOffset SaveOffset(Offset, MVT::i8); return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal; } @@ -418,7 +422,9 @@ void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, int64_t Offset) const { - int Off = Offset; // ARM doesn't need the general 64-bit offsets + // ARM doesn't need the general 64-bit offsets + StackOffset Off(Offset, MVT::i8); + unsigned i = 0; while (!MI.getOperand(i).isFI()) { @@ -441,40 +447,69 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); const AArch64InstrInfo *TII = MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); const AArch64FrameLowering *TFI = getFrameLowering(MF); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + bool Tagged = + MI.getOperand(FIOperandNum).getTargetFlags() & AArch64II::MO_TAGGED; unsigned FrameReg; - int Offset; // Special handling of dbg_value, stackmap and patchpoint instructions. 
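// These pseudo-instructions accept arbitrary 64-bit immediates, so the
// frame index can be rewritten directly to FrameReg + immediate without
// worrying about the encodable offset range of a real load or store (hence
// ForSimm=false below).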
if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP || MI.getOpcode() == TargetOpcode::PATCHPOINT) { - Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, - /*PreferFP=*/true, - /*ForSimm=*/false); - Offset += MI.getOperand(FIOperandNum + 1).getImm(); + StackOffset Offset = + TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, + /*PreferFP=*/true, + /*ForSimm=*/false); + Offset += StackOffset(MI.getOperand(FIOperandNum + 1).getImm(), MVT::i8); MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/); - MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getBytes()); return; } if (MI.getOpcode() == TargetOpcode::LOCAL_ESCAPE) { MachineOperand &FI = MI.getOperand(FIOperandNum); - Offset = TFI->getNonLocalFrameIndexReference(MF, FrameIndex); + int Offset = TFI->getNonLocalFrameIndexReference(MF, FrameIndex); FI.ChangeToImmediate(Offset); return; } + StackOffset Offset; if (MI.getOpcode() == AArch64::TAGPstack) { // TAGPstack must use the virtual frame register in its 3rd operand. - const MachineFrameInfo &MFI = MF.getFrameInfo(); const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); FrameReg = MI.getOperand(3).getReg(); - Offset = - MFI.getObjectOffset(FrameIndex) + AFI->getTaggedBasePointerOffset(); + Offset = {MFI.getObjectOffset(FrameIndex) + + AFI->getTaggedBasePointerOffset(), + MVT::i8}; + } else if (Tagged) { + StackOffset SPOffset = { + MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(), MVT::i8}; + if (MFI.hasVarSizedObjects() || + isAArch64FrameOffsetLegal(MI, SPOffset, nullptr, nullptr, nullptr) != + (AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal)) { + // Can't update to SP + offset in place. Precalculate the tagged pointer + // in a scratch register. + Offset = TFI->resolveFrameIndexReference( + MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true); + Register ScratchReg = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, + TII); + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::LDG), ScratchReg) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm(0); + MI.getOperand(FIOperandNum) + .ChangeToRegister(ScratchReg, false, false, true); + return; + } + FrameReg = AArch64::SP; + Offset = {MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(), + MVT::i8}; } else { Offset = TFI->resolveFrameIndexReference( MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true); @@ -490,7 +525,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // If we get here, the immediate doesn't fit into the instruction. We folded // as much as possible above. Handle the rest, providing a register that is // SP+LargeImm. 
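// emitFrameOffset materializes FrameReg + Offset into the scratch register,
// possibly using several instructions, after which the memory operand can
// simply refer to the scratch register with a zero offset.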
- unsigned ScratchReg = + Register ScratchReg = MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); diff --git a/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp index 854670079e40..28a7e680849b 100644 --- a/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp +++ b/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp @@ -426,16 +426,16 @@ bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); // Get the operands of the current SIMD arithmetic instruction. - unsigned MulDest = MI.getOperand(0).getReg(); - unsigned SrcReg0 = MI.getOperand(1).getReg(); + Register MulDest = MI.getOperand(0).getReg(); + Register SrcReg0 = MI.getOperand(1).getReg(); unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill()); - unsigned SrcReg1 = MI.getOperand(2).getReg(); + Register SrcReg1 = MI.getOperand(2).getReg(); unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill()); unsigned DupDest; // Instructions of interest have either 4 or 5 operands. if (MI.getNumOperands() == 5) { - unsigned SrcReg2 = MI.getOperand(3).getReg(); + Register SrcReg2 = MI.getOperand(3).getReg(); unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill()); unsigned LaneNumber = MI.getOperand(4).getImm(); // Create a new DUP instruction. Note that if an equivalent DUP instruction diff --git a/lib/Target/AArch64/AArch64SVEInstrInfo.td b/lib/Target/AArch64/AArch64SVEInstrInfo.td index 79ab42f4c080..b573eac76754 100644 --- a/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -82,11 +82,11 @@ let Predicates = [HasSVE] in { defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr">; defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr">; - defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot">; - defm UDOT_ZZZ : sve_intx_dot<0b1, "udot">; + defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", int_aarch64_sve_sdot>; + defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", int_aarch64_sve_udot>; - defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot">; - defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot">; + defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>; + defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>; defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb">; defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb">; @@ -94,14 +94,14 @@ let Predicates = [HasSVE] in { defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth">; defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw">; defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw">; - defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs">; - defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg">; + defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs", int_aarch64_sve_abs>; + defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg", int_aarch64_sve_neg>; - defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls">; - defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz">; - defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt">; - defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot">; - defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not">; + defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", null_frag>; + defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", null_frag>; + defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, 
"cnt", int_aarch64_sve_cnt>; + defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", null_frag>; + defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", null_frag>; defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs">; defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg">; @@ -138,12 +138,12 @@ let Predicates = [HasSVE] in { defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr">; defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv">; - defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd">; - defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub">; - defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul">; - defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul">; - defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps">; - defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts">; + defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>; + defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", null_frag>; + defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", null_frag>; + defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul", null_frag>; + defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", null_frag>; + defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", null_frag>; defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel">; @@ -187,7 +187,7 @@ let Predicates = [HasSVE] in { defm FCPY_ZPmI : sve_int_dup_fpimm_pred<"fcpy">; // Splat scalar register (unpredicated, GPR or vector + element index) - defm DUP_ZR : sve_int_perm_dup_r<"dup">; + defm DUP_ZR : sve_int_perm_dup_r<"dup", AArch64dup>; defm DUP_ZZI : sve_int_perm_dup_i<"dup">; // Splat scalar register (predicated) @@ -211,13 +211,13 @@ let Predicates = [HasSVE] in { defm REV_PP : sve_int_perm_reverse_p<"rev">; defm REV_ZZ : sve_int_perm_reverse_z<"rev">; - defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo">; - defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi">; - defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo">; - defm UUNPKHI_ZZ : sve_int_perm_unpk<0b11, "uunpkhi">; + defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo", AArch64sunpklo>; + defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi", AArch64sunpkhi>; + defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo", AArch64uunpklo>; + defm UUNPKHI_ZZ : sve_int_perm_unpk<0b11, "uunpkhi", AArch64uunpkhi>; - def PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo">; - def PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi">; + defm PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo", int_aarch64_sve_punpklo>; + defm PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi", int_aarch64_sve_punpkhi>; defm MOVPRFX_ZPzZ : sve_int_movprfx_pred_zero<0b000, "movprfx">; defm MOVPRFX_ZPmZ : sve_int_movprfx_pred_merge<0b001, "movprfx">; @@ -1020,6 +1020,56 @@ let Predicates = [HasSVE] in { (FCMGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>; def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn", (FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>; + + def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv8f16 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv4f32 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv2f64 ZPR:$src))), (nxv16i8 ZPR:$src)>; + + def : Pat<(nxv8i16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert 
(nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>; + + def : Pat<(nxv4i32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv4f32 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4i32 ZPR:$src)>; + + def : Pat<(nxv2i64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>; + + def : Pat<(nxv8f16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8f16 ZPR:$src)>; + + def : Pat<(nxv4f32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv4i32 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4f32 ZPR:$src)>; + + def : Pat<(nxv2f64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>; + } let Predicates = [HasSVE2] in { @@ -1164,6 +1214,13 @@ let Predicates = [HasSVE2] in { defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr">; defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr">; + // SVE2 predicated shifts + defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">; + defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">; + defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">; + defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">; + defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">; + // SVE2 integer add/subtract long defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb">; defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt">; @@ -1199,14 +1256,14 @@ let Predicates = [HasSVE2] in { defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt">; // SVE2 bitwise shift and insert - defm SRI_ZZI : sve2_int_bin_cons_shift_imm_right<0b0, "sri">; - defm SLI_ZZI : sve2_int_bin_cons_shift_imm_left< 0b1, "sli">; + defm SRI_ZZI : 
sve2_int_bin_shift_imm_right<0b0, "sri">; + defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli">; // SVE2 bitwise shift right and accumulate - defm SSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b00, "ssra">; - defm USRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b01, "usra">; - defm SRSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b10, "srsra">; - defm URSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b11, "ursra">; + defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra">; + defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra">; + defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra">; + defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra">; // SVE2 complex integer add defm CADD_ZZI : sve2_int_cadd<0b0, "cadd">; @@ -1228,41 +1285,47 @@ let Predicates = [HasSVE2] in { defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb">; defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt">; - // SVE2 bitwise shift right narrow - defm SQSHRUNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0000, "sqshrunb">; - defm SQSHRUNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0001, "sqshrunt">; - defm SQRSHRUNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0010, "sqrshrunb">; - defm SQRSHRUNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0011, "sqrshrunt">; - defm SHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0100, "shrnb">; - defm SHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0101, "shrnt">; - defm RSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0110, "rshrnb">; - defm RSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0111, "rshrnt">; - defm SQSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1000, "sqshrnb">; - defm SQSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1001, "sqshrnt">; - defm SQRSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1010, "sqrshrnb">; - defm SQRSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1011, "sqrshrnt">; - defm UQSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1100, "uqshrnb">; - defm UQSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1101, "uqshrnt">; - defm UQRSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1110, "uqrshrnb">; - defm UQRSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1111, "uqrshrnt">; + // SVE2 bitwise shift right narrow (bottom) + defm SQSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb">; + defm SQRSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b001, "sqrshrunb">; + defm SHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b010, "shrnb">; + defm RSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb">; + defm SQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b100, "sqshrnb">; + defm SQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b101, "sqrshrnb">; + defm UQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b110, "uqshrnb">; + defm UQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b111, "uqrshrnb">; + + // SVE2 bitwise shift right narrow (top) + defm SQSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b000, "sqshrunt">; + defm SQRSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b001, "sqrshrunt">; + defm SHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b010, "shrnt">; + defm RSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b011, "rshrnt">; + defm SQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b100, "sqshrnt">; + defm SQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b101, 
"sqrshrnt">; + defm UQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b110, "uqshrnt">; + defm UQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b111, "uqrshrnt">; + + // SVE2 integer add/subtract narrow high part (bottom) + defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b00, "addhnb">; + defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b01, "raddhnb">; + defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b10, "subhnb">; + defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b11, "rsubhnb">; - // SVE2 integer add/subtract narrow high part - defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high<0b000, "addhnb">; - defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high<0b001, "addhnt">; - defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high<0b010, "raddhnb">; - defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high<0b011, "raddhnt">; - defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high<0b100, "subhnb">; - defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high<0b101, "subhnt">; - defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high<0b110, "rsubhnb">; - defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high<0b111, "rsubhnt">; + // SVE2 integer add/subtract narrow high part (top) + defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b00, "addhnt">; + defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b01, "raddhnt">; + defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b10, "subhnt">; + defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b11, "rsubhnt">; - // SVE2 saturating extract narrow - defm SQXTNB_ZZ : sve2_int_sat_extract_narrow<0b000, "sqxtnb">; - defm SQXTNT_ZZ : sve2_int_sat_extract_narrow<0b001, "sqxtnt">; - defm UQXTNB_ZZ : sve2_int_sat_extract_narrow<0b010, "uqxtnb">; - defm UQXTNT_ZZ : sve2_int_sat_extract_narrow<0b011, "uqxtnt">; - defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow<0b100, "sqxtunb">; - defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow<0b101, "sqxtunt">; + // SVE2 saturating extract narrow (bottom) + defm SQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b00, "sqxtnb">; + defm UQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b01, "uqxtnb">; + defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b10, "sqxtunb">; + + // SVE2 saturating extract narrow (top) + defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt">; + defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt">; + defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt">; // SVE2 character match defm MATCH_PPzZZ : sve2_char_match<0b0, "match">; @@ -1289,10 +1352,14 @@ let Predicates = [HasSVE2] in { // SVE2 histogram generation (vector) defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt">; + // SVE2 floating-point base 2 logarithm as integer + defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">; + // SVE2 floating-point convert precision defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtxnt">; defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt">; defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt">; + def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>; // SVE2 floating-point pairwise operations defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp">; @@ -1321,58 +1388,45 @@ let Predicates = [HasSVE2] in { def BSL2N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b101, "bsl2n">; def NBSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b111, "nbsl">; - // sve_int_rotate_imm + // SVE2 bitwise xor and rotate right by immediate defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar">; // SVE2 extract vector (immediate offset, constructive) def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">; - // SVE 
floating-point convert precision - def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>; - - // SVE floating-point convert to integer - defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">; - - // Non-temporal contiguous loads (vector + register) - defm LDNT1SB_ZZR_S : sve2_mem_cldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>; - defm LDNT1B_ZZR_S : sve2_mem_cldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>; - defm LDNT1SH_ZZR_S : sve2_mem_cldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>; - defm LDNT1H_ZZR_S : sve2_mem_cldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>; - defm LDNT1W_ZZR_S : sve2_mem_cldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>; + // SVE2 non-temporal gather loads + defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>; + defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>; + defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>; + defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>; + defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>; - defm LDNT1SB_ZZR_D : sve2_mem_cldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>; - defm LDNT1B_ZZR_D : sve2_mem_cldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>; - defm LDNT1SH_ZZR_D : sve2_mem_cldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>; - defm LDNT1H_ZZR_D : sve2_mem_cldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>; - defm LDNT1SW_ZZR_D : sve2_mem_cldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>; - defm LDNT1W_ZZR_D : sve2_mem_cldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>; - defm LDNT1D_ZZR_D : sve2_mem_cldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>; + defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>; + defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>; + defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>; + defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>; + defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>; + defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>; + defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>; // SVE2 vector splice (constructive) defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">; - // Predicated shifts - defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">; - defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">; - defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">; - defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">; - defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">; - - // Non-temporal contiguous stores (vector + register) - defm STNT1B_ZZR_S : sve2_mem_cstnt_vs<0b001, "stnt1b", Z_s, ZPR32>; - defm STNT1H_ZZR_S : sve2_mem_cstnt_vs<0b011, "stnt1h", Z_s, ZPR32>; - defm STNT1W_ZZR_S : sve2_mem_cstnt_vs<0b101, "stnt1w", Z_s, ZPR32>; + // SVE2 non-temporal scatter stores + defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>; + defm STNT1H_ZZR_S : sve2_mem_sstnt_vs<0b011, "stnt1h", Z_s, ZPR32>; + defm STNT1W_ZZR_S : sve2_mem_sstnt_vs<0b101, "stnt1w", Z_s, ZPR32>; - defm STNT1B_ZZR_D : sve2_mem_cstnt_vs<0b000, "stnt1b", Z_d, ZPR64>; - defm STNT1H_ZZR_D : sve2_mem_cstnt_vs<0b010, "stnt1h", Z_d, ZPR64>; - defm STNT1W_ZZR_D : sve2_mem_cstnt_vs<0b100, "stnt1w", Z_d, ZPR64>; - defm STNT1D_ZZR_D : sve2_mem_cstnt_vs<0b110, "stnt1d", Z_d, ZPR64>; + defm STNT1B_ZZR_D : sve2_mem_sstnt_vs<0b000, "stnt1b", Z_d, ZPR64>; + defm STNT1H_ZZR_D : sve2_mem_sstnt_vs<0b010, "stnt1h", Z_d, ZPR64>; + defm STNT1W_ZZR_D : sve2_mem_sstnt_vs<0b100, "stnt1w", Z_d, ZPR64>; + defm STNT1D_ZZR_D : 
sve2_mem_sstnt_vs<0b110, "stnt1d", Z_d, ZPR64>; - // SVE table lookup (three sources) + // SVE2 table lookup (three sources) defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl">; defm TBX_ZZZ : sve2_int_perm_tbx<"tbx">; - // SVE integer compare scalar count and limit + // SVE2 integer compare scalar count and limit defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege">; defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt">; defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs">; @@ -1383,7 +1437,7 @@ let Predicates = [HasSVE2] in { defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs">; defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi">; - // SVE pointer conflict compare + // SVE2 pointer conflict compare defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr">; defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw">; } diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 60dbace03ca6..ba61ed726e84 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -32,7 +32,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( const AArch64TargetLowering &TLI = *STI.getTargetLowering(); EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout()); - Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + Type *IntPtrTy = Type::getInt8PtrTy(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Dst; diff --git a/lib/Target/AArch64/AArch64SpeculationHardening.cpp b/lib/Target/AArch64/AArch64SpeculationHardening.cpp index 3087e6ce441d..7307961ddb5f 100644 --- a/lib/Target/AArch64/AArch64SpeculationHardening.cpp +++ b/lib/Target/AArch64/AArch64SpeculationHardening.cpp @@ -106,6 +106,7 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/Debug.h" #include "llvm/Target/TargetMachine.h" #include <cassert> @@ -115,9 +116,9 @@ using namespace llvm; #define AARCH64_SPECULATION_HARDENING_NAME "AArch64 speculation hardening pass" -cl::opt<bool> HardenLoads("aarch64-slh-loads", cl::Hidden, - cl::desc("Sanitize loads from memory."), - cl::init(true)); +static cl::opt<bool> HardenLoads("aarch64-slh-loads", cl::Hidden, + cl::desc("Sanitize loads from memory."), + cl::init(true)); namespace { @@ -521,7 +522,7 @@ bool AArch64SpeculationHardening::slhLoads(MachineBasicBlock &MBB) { for (auto Use : MI.uses()) { if (!Use.isReg()) continue; - unsigned Reg = Use.getReg(); + Register Reg = Use.getReg(); // Some loads of floating point data have implicit defs/uses on a // super register of that floating point data. Some examples: // $s0 = LDRSui $sp, 22, implicit-def $q0 @@ -561,8 +562,8 @@ bool AArch64SpeculationHardening::expandSpeculationSafeValue( // miss-speculation isn't happening because we're already inserting barriers // to guarantee that. if (!UseControlFlowSpeculationBarrier && !UsesFullSpeculationBarrier) { - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); // Mark this register and all its aliasing registers as needing to be // value speculation hardened before its next use, by using a CSDB // barrier instruction. 
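The new AArch64StackOffset.h below models a frame offset as a fixed byte part plus a scalable part whose size is only known as a multiple of the hardware vector length. Its decomposition helper targets the SVE ADDVL/ADDPL instructions: eight predicate-length (PL) units make up one vector-length (VL) unit, and two ADDPL instructions (each with a 6-bit signed immediate) cover -64..62 PL units, which explains the bounds in getForFrameOffset further down. What follows is a minimal standalone sketch of that arithmetic, not the LLVM class itself; the Offset struct and decompose function are illustrative stand-ins.

#include <cassert>
#include <cstdint>
#include <cstdio>

// A hypothetical stand-in for the StackOffset class introduced below:
// offsets are a fixed byte part plus a scalable byte part.
struct Offset {
  int64_t Bytes = 0;
  int64_t ScalableBytes = 0;
};

// Mirrors getForFrameOffset(): split the scalable part into ADDVL (data
// vector) and ADDPL (predicate) units, folding into whole data vectors when
// the ADDPL count is exact or outside the two-instruction budget.
static void decompose(const Offset &O, int64_t &NumBytes,
                      int64_t &NumPredicateVectors, int64_t &NumDataVectors) {
  assert(O.ScalableBytes % 2 == 0 && "predicates are 2 scalable bytes");
  NumBytes = O.Bytes;
  NumDataVectors = 0;
  NumPredicateVectors = O.ScalableBytes / 2;
  if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
      NumPredicateVectors > 62) {
    NumDataVectors = NumPredicateVectors / 8;
    NumPredicateVectors -= NumDataVectors * 8;
  }
}

int main() {
  // One full SVE data vector (16 scalable bytes) on top of an 8-byte slot.
  Offset O{8, 16};
  int64_t Bytes, PL, VL;
  decompose(O, Bytes, PL, VL);
  // Prints: bytes=8 addpl=0 addvl=1
  std::printf("bytes=%lld addpl=%lld addvl=%lld\n", (long long)Bytes,
              (long long)PL, (long long)VL);
}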
diff --git a/lib/Target/AArch64/AArch64StackOffset.h b/lib/Target/AArch64/AArch64StackOffset.h
new file mode 100644
index 000000000000..13f12a6c9c30
--- /dev/null
+++ b/lib/Target/AArch64/AArch64StackOffset.h
@@ -0,0 +1,138 @@
+//==--AArch64StackOffset.h ---------------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the StackOffset class, which is used to
+// describe scalable and non-scalable offsets during frame lowering.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64STACKOFFSET_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64STACKOFFSET_H
+
+#include "llvm/Support/MachineValueType.h"
+
+namespace llvm {
+
+/// StackOffset is a wrapper around scalable and non-scalable offsets and is
+/// used in several functions such as 'isAArch64FrameOffsetLegal' and
+/// 'emitFrameOffset()'. StackOffsets are described by MVTs, e.g.
+//
+///   StackOffset(1, MVT::nxv16i8)
+//
+/// would describe an offset as being the size of a single SVE vector.
+///
+/// The class also implements simple arithmetic (addition/subtraction) on these
+/// offsets, e.g.
+//
+///   StackOffset(1, MVT::nxv16i8) + StackOffset(1, MVT::i64)
+//
+/// describes an offset that spans the combined storage required for an SVE
+/// vector and a 64bit GPR.
+class StackOffset {
+  int64_t Bytes;
+  int64_t ScalableBytes;
+
+  explicit operator int() const;
+
+public:
+  using Part = std::pair<int64_t, MVT>;
+
+  StackOffset() : Bytes(0), ScalableBytes(0) {}
+
+  StackOffset(int64_t Offset, MVT::SimpleValueType T) : StackOffset() {
+    assert(MVT(T).getSizeInBits() % 8 == 0 &&
+           "Offset type is not a multiple of bytes");
+    *this += Part(Offset, T);
+  }
+
+  StackOffset(const StackOffset &Other)
+      : Bytes(Other.Bytes), ScalableBytes(Other.ScalableBytes) {}
+
+  StackOffset &operator=(const StackOffset &) = default;
+
+  StackOffset &operator+=(const StackOffset::Part &Other) {
+    int64_t OffsetInBytes = Other.first * (Other.second.getSizeInBits() / 8);
+    if (Other.second.isScalableVector())
+      ScalableBytes += OffsetInBytes;
+    else
+      Bytes += OffsetInBytes;
+    return *this;
+  }
+
+  StackOffset &operator+=(const StackOffset &Other) {
+    Bytes += Other.Bytes;
+    ScalableBytes += Other.ScalableBytes;
+    return *this;
+  }
+
+  StackOffset operator+(const StackOffset &Other) const {
+    StackOffset Res(*this);
+    Res += Other;
+    return Res;
+  }
+
+  StackOffset &operator-=(const StackOffset &Other) {
+    Bytes -= Other.Bytes;
+    ScalableBytes -= Other.ScalableBytes;
+    return *this;
+  }
+
+  StackOffset operator-(const StackOffset &Other) const {
+    StackOffset Res(*this);
+    Res -= Other;
+    return Res;
+  }
+
+  StackOffset operator-() const {
+    StackOffset Res = {};
+    const StackOffset Other(*this);
+    Res -= Other;
+    return Res;
+  }
+
+  /// Returns the scalable part of the offset in bytes.
+  int64_t getScalableBytes() const { return ScalableBytes; }
+
+  /// Returns the non-scalable part of the offset in bytes.
+  int64_t getBytes() const { return Bytes; }
+
+  /// Returns the offset in parts to which this frame offset can be
+  /// decomposed for the purpose of describing a frame offset.
+  /// For non-scalable offsets this is simply its byte size.
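/// For scalable offsets, the split below relies on the fixed SVE ratio of
/// eight predicate-sized (PL) units per data-vector-sized (VL) unit; e.g. a
/// scalable offset of 130 bytes (65 PL units) is returned as 8 data vectors
/// plus 1 predicate vector, because 65 exceeds the two-ADDPL budget of 62.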
+ void getForFrameOffset(int64_t &NumBytes, int64_t &NumPredicateVectors, + int64_t &NumDataVectors) const { + assert(isValid() && "Invalid frame offset"); + + NumBytes = Bytes; + NumDataVectors = 0; + NumPredicateVectors = ScalableBytes / 2; + // This method is used to get the offsets to adjust the frame offset. + // If the function requires ADDPL to be used and needs more than two ADDPL + // instructions, part of the offset is folded into NumDataVectors so that it + // uses ADDVL for part of it, reducing the number of ADDPL instructions. + if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 || + NumPredicateVectors > 62) { + NumDataVectors = NumPredicateVectors / 8; + NumPredicateVectors -= NumDataVectors * 8; + } + } + + /// Returns whether the offset is known zero. + explicit operator bool() const { return Bytes || ScalableBytes; } + + bool isValid() const { + // The smallest scalable element supported by scaled SVE addressing + // modes are predicates, which are 2 scalable bytes in size. So the scalable + // byte offset must always be a multiple of 2. + return ScalableBytes % 2 == 0; + } +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/AArch64StackTagging.cpp b/lib/Target/AArch64/AArch64StackTagging.cpp index 6e99c48bf1d7..e6dbe01d3807 100644 --- a/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/lib/Target/AArch64/AArch64StackTagging.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -55,9 +56,215 @@ using namespace llvm; #define DEBUG_TYPE "stack-tagging" -static constexpr unsigned kTagGranuleSize = 16; +static cl::opt<bool> ClMergeInit( + "stack-tagging-merge-init", cl::Hidden, cl::init(true), cl::ZeroOrMore, + cl::desc("merge stack variable initializers with tagging when possible")); + +static cl::opt<unsigned> ClScanLimit("stack-tagging-merge-init-scan-limit", + cl::init(40), cl::Hidden); + +static const Align kTagGranuleSize = Align(16); namespace { + +class InitializerBuilder { + uint64_t Size; + const DataLayout *DL; + Value *BasePtr; + Function *SetTagFn; + Function *SetTagZeroFn; + Function *StgpFn; + + // List of initializers sorted by start offset. + struct Range { + uint64_t Start, End; + Instruction *Inst; + }; + SmallVector<Range, 4> Ranges; + // 8-aligned offset => 8-byte initializer + // Missing keys are zero initialized. + std::map<uint64_t, Value *> Out; + +public: + InitializerBuilder(uint64_t Size, const DataLayout *DL, Value *BasePtr, + Function *SetTagFn, Function *SetTagZeroFn, + Function *StgpFn) + : Size(Size), DL(DL), BasePtr(BasePtr), SetTagFn(SetTagFn), + SetTagZeroFn(SetTagZeroFn), StgpFn(StgpFn) {} + + bool addRange(uint64_t Start, uint64_t End, Instruction *Inst) { + auto I = std::lower_bound( + Ranges.begin(), Ranges.end(), Start, + [](const Range &LHS, uint64_t RHS) { return LHS.End <= RHS; }); + if (I != Ranges.end() && End > I->Start) { + // Overlap - bail. 
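// (Overlapping initializers would need byte-precise merge-order tracking;
// giving up leaves the store in place and ends the scan early.)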
+ return false; + } + Ranges.insert(I, {Start, End, Inst}); + return true; + } + + bool addStore(uint64_t Offset, StoreInst *SI, const DataLayout *DL) { + int64_t StoreSize = DL->getTypeStoreSize(SI->getOperand(0)->getType()); + if (!addRange(Offset, Offset + StoreSize, SI)) + return false; + IRBuilder<> IRB(SI); + applyStore(IRB, Offset, Offset + StoreSize, SI->getOperand(0)); + return true; + } + + bool addMemSet(uint64_t Offset, MemSetInst *MSI) { + uint64_t StoreSize = cast<ConstantInt>(MSI->getLength())->getZExtValue(); + if (!addRange(Offset, Offset + StoreSize, MSI)) + return false; + IRBuilder<> IRB(MSI); + applyMemSet(IRB, Offset, Offset + StoreSize, + cast<ConstantInt>(MSI->getValue())); + return true; + } + + void applyMemSet(IRBuilder<> &IRB, int64_t Start, int64_t End, + ConstantInt *V) { + // Out[] does not distinguish between zero and undef, and we already know + // that this memset does not overlap with any other initializer. Nothing to + // do for memset(0). + if (V->isZero()) + return; + for (int64_t Offset = Start - Start % 8; Offset < End; Offset += 8) { + uint64_t Cst = 0x0101010101010101UL; + int LowBits = Offset < Start ? (Start - Offset) * 8 : 0; + if (LowBits) + Cst = (Cst >> LowBits) << LowBits; + int HighBits = End - Offset < 8 ? (8 - (End - Offset)) * 8 : 0; + if (HighBits) + Cst = (Cst << HighBits) >> HighBits; + ConstantInt *C = + ConstantInt::get(IRB.getInt64Ty(), Cst * V->getZExtValue()); + + Value *&CurrentV = Out[Offset]; + if (!CurrentV) { + CurrentV = C; + } else { + CurrentV = IRB.CreateOr(CurrentV, C); + } + } + } + + // Take a 64-bit slice of the value starting at the given offset (in bytes). + // Offset can be negative. Pad with zeroes on both sides when necessary. + Value *sliceValue(IRBuilder<> &IRB, Value *V, int64_t Offset) { + if (Offset > 0) { + V = IRB.CreateLShr(V, Offset * 8); + V = IRB.CreateZExtOrTrunc(V, IRB.getInt64Ty()); + } else if (Offset < 0) { + V = IRB.CreateZExtOrTrunc(V, IRB.getInt64Ty()); + V = IRB.CreateShl(V, -Offset * 8); + } else { + V = IRB.CreateZExtOrTrunc(V, IRB.getInt64Ty()); + } + return V; + } + + void applyStore(IRBuilder<> &IRB, int64_t Start, int64_t End, + Value *StoredValue) { + StoredValue = flatten(IRB, StoredValue); + for (int64_t Offset = Start - Start % 8; Offset < End; Offset += 8) { + Value *V = sliceValue(IRB, StoredValue, Offset - Start); + Value *&CurrentV = Out[Offset]; + if (!CurrentV) { + CurrentV = V; + } else { + CurrentV = IRB.CreateOr(CurrentV, V); + } + } + } + + void generate(IRBuilder<> &IRB) { + LLVM_DEBUG(dbgs() << "Combined initializer\n"); + // No initializers => the entire allocation is undef. + if (Ranges.empty()) { + emitUndef(IRB, 0, Size); + return; + } + + // Look through 8-byte initializer list 16 bytes at a time; + // If one of the two 8-byte halfs is non-zero non-undef, emit STGP. + // Otherwise, emit zeroes up to next available item. + uint64_t LastOffset = 0; + for (uint64_t Offset = 0; Offset < Size; Offset += 16) { + auto I1 = Out.find(Offset); + auto I2 = Out.find(Offset + 8); + if (I1 == Out.end() && I2 == Out.end()) + continue; + + if (Offset > LastOffset) + emitZeroes(IRB, LastOffset, Offset - LastOffset); + + Value *Store1 = I1 == Out.end() ? Constant::getNullValue(IRB.getInt64Ty()) + : I1->second; + Value *Store2 = I2 == Out.end() ? Constant::getNullValue(IRB.getInt64Ty()) + : I2->second; + emitPair(IRB, Offset, Store1, Store2); + LastOffset = Offset + 16; + } + + // memset(0) does not update Out[], therefore the tail can be either undef + // or zero. 
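// Emitting zeroes for that tail is correct either way: undef bytes may
// take any value, and settag_zero should be no more expensive than a
// plain settag.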
+ if (LastOffset < Size) + emitZeroes(IRB, LastOffset, Size - LastOffset); + + for (const auto &R : Ranges) { + R.Inst->eraseFromParent(); + } + } + + void emitZeroes(IRBuilder<> &IRB, uint64_t Offset, uint64_t Size) { + LLVM_DEBUG(dbgs() << " [" << Offset << ", " << Offset + Size + << ") zero\n"); + Value *Ptr = BasePtr; + if (Offset) + Ptr = IRB.CreateConstGEP1_32(Ptr, Offset); + IRB.CreateCall(SetTagZeroFn, + {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)}); + } + + void emitUndef(IRBuilder<> &IRB, uint64_t Offset, uint64_t Size) { + LLVM_DEBUG(dbgs() << " [" << Offset << ", " << Offset + Size + << ") undef\n"); + Value *Ptr = BasePtr; + if (Offset) + Ptr = IRB.CreateConstGEP1_32(Ptr, Offset); + IRB.CreateCall(SetTagFn, {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)}); + } + + void emitPair(IRBuilder<> &IRB, uint64_t Offset, Value *A, Value *B) { + LLVM_DEBUG(dbgs() << " [" << Offset << ", " << Offset + 16 << "):\n"); + LLVM_DEBUG(dbgs() << " " << *A << "\n " << *B << "\n"); + Value *Ptr = BasePtr; + if (Offset) + Ptr = IRB.CreateConstGEP1_32(Ptr, Offset); + IRB.CreateCall(StgpFn, {Ptr, A, B}); + } + + Value *flatten(IRBuilder<> &IRB, Value *V) { + if (V->getType()->isIntegerTy()) + return V; + // vector of pointers -> vector of ints + if (VectorType *VecTy = dyn_cast<VectorType>(V->getType())) { + LLVMContext &Ctx = IRB.getContext(); + Type *EltTy = VecTy->getElementType(); + if (EltTy->isPointerTy()) { + uint32_t EltSize = DL->getTypeSizeInBits(EltTy); + Type *NewTy = VectorType::get(IntegerType::get(Ctx, EltSize), + VecTy->getNumElements()); + V = IRB.CreatePointerCast(V, NewTy); + } + } + return IRB.CreateBitOrPointerCast( + V, IRB.getIntNTy(DL->getTypeStoreSize(V->getType()) * 8)); + } +}; + class AArch64StackTagging : public FunctionPass { struct AllocaInfo { AllocaInst *AI; @@ -67,10 +274,15 @@ class AArch64StackTagging : public FunctionPass { int Tag; // -1 for non-tagged allocations }; + bool MergeInit; + public: static char ID; // Pass ID, replacement for typeid - AArch64StackTagging() : FunctionPass(ID) { + AArch64StackTagging(bool MergeInit = true) + : FunctionPass(ID), + MergeInit(ClMergeInit.getNumOccurrences() > 0 ? 
ClMergeInit + : MergeInit) { initializeAArch64StackTaggingPass(*PassRegistry::getPassRegistry()); } @@ -81,6 +293,9 @@ public: uint64_t Size); void untagAlloca(AllocaInst *AI, Instruction *InsertBefore, uint64_t Size); + Instruction *collectInitializers(Instruction *StartInst, Value *StartPtr, + uint64_t Size, InitializerBuilder &IB); + Instruction * insertBaseTaggedPointer(const MapVector<AllocaInst *, AllocaInfo> &Allocas, const DominatorTree *DT); @@ -92,9 +307,12 @@ private: Function *F; Function *SetTagFunc; const DataLayout *DL; + AAResults *AA; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + if (MergeInit) + AU.addRequired<AAResultsWrapperPass>(); } }; @@ -107,8 +325,68 @@ INITIALIZE_PASS_BEGIN(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging", INITIALIZE_PASS_END(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging", false, false) -FunctionPass *llvm::createAArch64StackTaggingPass() { - return new AArch64StackTagging(); +FunctionPass *llvm::createAArch64StackTaggingPass(bool MergeInit) { + return new AArch64StackTagging(MergeInit); +} + +Instruction *AArch64StackTagging::collectInitializers(Instruction *StartInst, + Value *StartPtr, + uint64_t Size, + InitializerBuilder &IB) { + MemoryLocation AllocaLoc{StartPtr, Size}; + Instruction *LastInst = StartInst; + BasicBlock::iterator BI(StartInst); + + unsigned Count = 0; + for (; Count < ClScanLimit && !BI->isTerminator(); ++BI) { + if (!isa<DbgInfoIntrinsic>(*BI)) + ++Count; + + if (isNoModRef(AA->getModRefInfo(&*BI, AllocaLoc))) + continue; + + if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) { + // If the instruction is readnone, ignore it, otherwise bail out. We + // don't even allow readonly here because we don't want something like: + // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A). + if (BI->mayWriteToMemory() || BI->mayReadFromMemory()) + break; + continue; + } + + if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) { + if (!NextStore->isSimple()) + break; + + // Check to see if this store is to a constant offset from the start ptr. + Optional<int64_t> Offset = + isPointerOffset(StartPtr, NextStore->getPointerOperand(), *DL); + if (!Offset) + break; + + if (!IB.addStore(*Offset, NextStore, DL)) + break; + LastInst = NextStore; + } else { + MemSetInst *MSI = cast<MemSetInst>(BI); + + if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) + break; + + if (!isa<ConstantInt>(MSI->getValue())) + break; + + // Check to see if this store is to a constant offset from the start ptr. + Optional<int64_t> Offset = isPointerOffset(StartPtr, MSI->getDest(), *DL); + if (!Offset) + break; + + if (!IB.addMemSet(*Offset, MSI)) + break; + LastInst = MSI; + } + } + return LastInst; } bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) { @@ -127,8 +405,23 @@ bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) { void AArch64StackTagging::tagAlloca(AllocaInst *AI, Instruction *InsertBefore, Value *Ptr, uint64_t Size) { + auto SetTagZeroFunc = + Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag_zero); + auto StgpFunc = + Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_stgp); + + InitializerBuilder IB(Size, DL, Ptr, SetTagFunc, SetTagZeroFunc, StgpFunc); + bool LittleEndian = + Triple(AI->getModule()->getTargetTriple()).isLittleEndian(); + // Current implementation of initializer merging assumes little endianness. 
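// (sliceValue() above composes the 64-bit words with plain shifts, which
// reproduces memory byte order only on little-endian targets.)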
+ if (MergeInit && !F->hasOptNone() && LittleEndian) { + LLVM_DEBUG(dbgs() << "collecting initializers for " << *AI + << ", size = " << Size << "\n"); + InsertBefore = collectInitializers(InsertBefore, Ptr, Size, IB); + } + IRBuilder<> IRB(InsertBefore); - IRB.CreateCall(SetTagFunc, {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)}); + IB.generate(IRB); } void AArch64StackTagging::untagAlloca(AllocaInst *AI, Instruction *InsertBefore, @@ -166,7 +459,8 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer( } void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) { - unsigned NewAlignment = std::max(Info.AI->getAlignment(), kTagGranuleSize); + const Align NewAlignment = + max(MaybeAlign(Info.AI->getAlignment()), kTagGranuleSize); Info.AI->setAlignment(NewAlignment); uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8; @@ -179,7 +473,7 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) { Info.AI->isArrayAllocation() ? ArrayType::get( Info.AI->getAllocatedType(), - dyn_cast<ConstantInt>(Info.AI->getArraySize())->getZExtValue()) + cast<ConstantInt>(Info.AI->getArraySize())->getZExtValue()) : Info.AI->getAllocatedType(); Type *PaddingType = ArrayType::get(Type::getInt8Ty(F->getContext()), AlignedSize - Size); @@ -187,7 +481,7 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) { auto *NewAI = new AllocaInst( TypeWithPadding, Info.AI->getType()->getAddressSpace(), nullptr, "", Info.AI); NewAI->takeName(Info.AI); - NewAI->setAlignment(Info.AI->getAlignment()); + NewAI->setAlignment(MaybeAlign(Info.AI->getAlignment())); NewAI->setUsedWithInAlloca(Info.AI->isUsedWithInAlloca()); NewAI->setSwiftError(Info.AI->isSwiftError()); NewAI->copyMetadata(*Info.AI); @@ -198,6 +492,24 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) { Info.AI = NewAI; } +// Helper function to check for post-dominance. 
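// PostDominatorTree only answers block-level queries, so when both
// intrinsics live in the same block the block is scanned directly:
// A post-dominates B exactly when A appears after B, i.e. when B is
// reached first during the scan.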
+static bool postDominates(const PostDominatorTree *PDT, const IntrinsicInst *A, + const IntrinsicInst *B) { + const BasicBlock *ABB = A->getParent(); + const BasicBlock *BBB = B->getParent(); + + if (ABB != BBB) + return PDT->dominates(ABB, BBB); + + for (const Instruction &I : *ABB) { + if (&I == B) + return true; + if (&I == A) + return false; + } + llvm_unreachable("Corrupt instruction list"); +} + // FIXME: check for MTE extension bool AArch64StackTagging::runOnFunction(Function &Fn) { if (!Fn.hasFnAttribute(Attribute::SanitizeMemTag)) @@ -205,6 +517,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { F = &Fn; DL = &Fn.getParent()->getDataLayout(); + if (MergeInit) + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); MapVector<AllocaInst *, AllocaInfo> Allocas; // need stable iteration order SmallVector<Instruction *, 8> RetVec; @@ -270,23 +584,31 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { if (NumInterestingAllocas == 0) return true; + std::unique_ptr<DominatorTree> DeleteDT; + DominatorTree *DT = nullptr; + if (auto *P = getAnalysisIfAvailable<DominatorTreeWrapperPass>()) + DT = &P->getDomTree(); + + if (DT == nullptr && (NumInterestingAllocas > 1 || + !F->hasFnAttribute(Attribute::OptimizeNone))) { + DeleteDT = std::make_unique<DominatorTree>(*F); + DT = DeleteDT.get(); + } + + std::unique_ptr<PostDominatorTree> DeletePDT; + PostDominatorTree *PDT = nullptr; + if (auto *P = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>()) + PDT = &P->getPostDomTree(); + + if (PDT == nullptr && !F->hasFnAttribute(Attribute::OptimizeNone)) { + DeletePDT = std::make_unique<PostDominatorTree>(*F); + PDT = DeletePDT.get(); + } + SetTagFunc = Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag); - // Compute DT only if the function has the attribute, there are more than 1 - // interesting allocas, and it is not available for free. - Instruction *Base; - if (NumInterestingAllocas > 1) { - auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - if (DTWP) { - Base = insertBaseTaggedPointer(Allocas, &DTWP->getDomTree()); - } else { - DominatorTree DT(*F); - Base = insertBaseTaggedPointer(Allocas, &DT); - } - } else { - Base = insertBaseTaggedPointer(Allocas, nullptr); - } + Instruction *Base = insertBaseTaggedPointer(Allocas, DT); for (auto &I : Allocas) { const AllocaInfo &Info = I.second; @@ -309,11 +631,37 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { if (UnrecognizedLifetimes.empty() && Info.LifetimeStart.size() == 1 && Info.LifetimeEnd.size() == 1) { IntrinsicInst *Start = Info.LifetimeStart[0]; + IntrinsicInst *End = Info.LifetimeEnd[0]; uint64_t Size = dyn_cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue(); Size = alignTo(Size, kTagGranuleSize); tagAlloca(AI, Start->getNextNode(), Start->getArgOperand(1), Size); - untagAlloca(AI, Info.LifetimeEnd[0], Size); + // We need to ensure that if we tag some object, we certainly untag it + // before the function exits. + if (PDT != nullptr && postDominates(PDT, End, Start)) { + untagAlloca(AI, End, Size); + } else { + SmallVector<Instruction *, 8> ReachableRetVec; + unsigned NumCoveredExits = 0; + for (auto &RI : RetVec) { + if (!isPotentiallyReachable(Start, RI, nullptr, DT)) + continue; + ReachableRetVec.push_back(RI); + if (DT != nullptr && DT->dominates(End, RI)) + ++NumCoveredExits; + } + // If there's a mix of covered and non-covered exits, just put the untag + // on exits, so we avoid the redundancy of untagging twice. 
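// Placement summary for a single tagged alloca, as implemented here:
//   - if the lifetime.end post-dominates the tag, or dominates every return
//     reachable from it, one untag at the lifetime.end suffices;
//   - otherwise an untag is placed on each reachable return and the
//     lifetime.end marker is erased, so no path untags twice.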
+ if (NumCoveredExits == ReachableRetVec.size()) { + untagAlloca(AI, End, Size); + } else { + for (auto &RI : ReachableRetVec) + untagAlloca(AI, RI, Size); + // We may have inserted untag outside of the lifetime interval. + // Remove the lifetime end call for this alloca. + End->eraseFromParent(); + } + } } else { uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8; Value *Ptr = IRB.CreatePointerCast(TagPCall, IRB.getInt8PtrTy()); diff --git a/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp new file mode 100644 index 000000000000..3cc556f74aea --- /dev/null +++ b/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp @@ -0,0 +1,209 @@ +//===-- AArch64StackTaggingPreRA.cpp --- Stack Tagging for AArch64 -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + + +#include "AArch64.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64InstrInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-stack-tagging-pre-ra" + +enum UncheckedLdStMode { UncheckedNever, UncheckedSafe, UncheckedAlways }; + +cl::opt<UncheckedLdStMode> ClUncheckedLdSt( + "stack-tagging-unchecked-ld-st", cl::Hidden, + cl::init(UncheckedSafe), + cl::desc( + "Unconditionally apply unchecked-ld-st optimization (even for large " + "stack frames, or in the presence of variable sized allocas)."), + cl::values( + clEnumValN(UncheckedNever, "never", "never apply unchecked-ld-st"), + clEnumValN( + UncheckedSafe, "safe", + "apply unchecked-ld-st when the target is definitely within range"), + clEnumValN(UncheckedAlways, "always", "always apply unchecked-ld-st"))); + +namespace { + +class AArch64StackTaggingPreRA : public MachineFunctionPass { + MachineFunction *MF; + AArch64FunctionInfo *AFI; + MachineFrameInfo *MFI; + MachineRegisterInfo *MRI; + const AArch64RegisterInfo *TRI; + const AArch64InstrInfo *TII; + + SmallVector<MachineInstr*, 16> ReTags; + +public: + static char ID; + AArch64StackTaggingPreRA() : MachineFunctionPass(ID) { + initializeAArch64StackTaggingPreRAPass(*PassRegistry::getPassRegistry()); + } + + bool mayUseUncheckedLoadStore(); + void uncheckUsesOf(unsigned TaggedReg, int FI); + void uncheckLoadsAndStores(); + + bool runOnMachineFunction(MachineFunction &Func) override; + StringRef getPassName() const override { + return "AArch64 Stack Tagging PreRA"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +} // end anonymous namespace + +char AArch64StackTaggingPreRA::ID = 
0; + +INITIALIZE_PASS_BEGIN(AArch64StackTaggingPreRA, "aarch64-stack-tagging-pre-ra", + "AArch64 Stack Tagging PreRA Pass", false, false) +INITIALIZE_PASS_END(AArch64StackTaggingPreRA, "aarch64-stack-tagging-pre-ra", + "AArch64 Stack Tagging PreRA Pass", false, false) + +FunctionPass *llvm::createAArch64StackTaggingPreRAPass() { + return new AArch64StackTaggingPreRA(); +} + +static bool isUncheckedLoadOrStoreOpcode(unsigned Opcode) { + switch (Opcode) { + case AArch64::LDRWui: + case AArch64::LDRSHWui: + case AArch64::LDRXui: + case AArch64::LDRBui: + case AArch64::LDRBBui: + case AArch64::LDRHui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + case AArch64::STRWui: + case AArch64::STRXui: + case AArch64::STRBui: + case AArch64::STRBBui: + case AArch64::STRHui: + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + return true; + default: + return false; + } +} + +bool AArch64StackTaggingPreRA::mayUseUncheckedLoadStore() { + if (ClUncheckedLdSt == UncheckedNever) + return false; + else if (ClUncheckedLdSt == UncheckedAlways) + return true; + + // This estimate can be improved if we had harder guarantees about stack frame + // layout. With LocalStackAllocation we can estimate SP offset to any + // preallocated slot. AArch64FrameLowering::orderFrameObjects could put tagged + // objects ahead of non-tagged ones, but that's not always desirable. + // + // Underestimating SP offset here may require the use of LDG to materialize + // the tagged address of the stack slot, along with a scratch register + // allocation (post-regalloc!). + // + // For now we do the safe thing here and require that the entire stack frame + // is within range of the shortest of the unchecked instructions. + unsigned FrameSize = 0; + for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) + FrameSize += MFI->getObjectSize(i); + bool EntireFrameReachableFromSP = FrameSize < 0xf00; + return !MFI->hasVarSizedObjects() && EntireFrameReachableFromSP; +} + +void AArch64StackTaggingPreRA::uncheckUsesOf(unsigned TaggedReg, int FI) { + for (auto UI = MRI->use_instr_begin(TaggedReg), E = MRI->use_instr_end(); + UI != E;) { + MachineInstr *UseI = &*(UI++); + if (isUncheckedLoadOrStoreOpcode(UseI->getOpcode())) { + // FI operand is always the one before the immediate offset. 
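// E.g. an unchecked scaled-immediate load has the MIR shape (assumed here
// purely for illustration)
//   %dst = LDRXui %tagged_base, 2
// getLoadStoreImmIdx returns the index of the immediate, so OpIdx below
// selects the base-address operand, which is rewritten to the frame index
// and flagged MO_TAGGED.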
+ unsigned OpIdx = TII->getLoadStoreImmIdx(UseI->getOpcode()) - 1; + if (UseI->getOperand(OpIdx).isReg() && + UseI->getOperand(OpIdx).getReg() == TaggedReg) { + UseI->getOperand(OpIdx).ChangeToFrameIndex(FI); + UseI->getOperand(OpIdx).setTargetFlags(AArch64II::MO_TAGGED); + } + } else if (UseI->isCopy() && + Register::isVirtualRegister(UseI->getOperand(0).getReg())) { + uncheckUsesOf(UseI->getOperand(0).getReg(), FI); + } + } +} + +void AArch64StackTaggingPreRA::uncheckLoadsAndStores() { + for (auto *I : ReTags) { + unsigned TaggedReg = I->getOperand(0).getReg(); + int FI = I->getOperand(1).getIndex(); + uncheckUsesOf(TaggedReg, FI); + } +} + +bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) { + MF = &Func; + MRI = &MF->getRegInfo(); + AFI = MF->getInfo<AArch64FunctionInfo>(); + TII = static_cast<const AArch64InstrInfo *>(MF->getSubtarget().getInstrInfo()); + TRI = static_cast<const AArch64RegisterInfo *>( + MF->getSubtarget().getRegisterInfo()); + MFI = &MF->getFrameInfo(); + ReTags.clear(); + + assert(MRI->isSSA()); + + LLVM_DEBUG(dbgs() << "********** AArch64 Stack Tagging PreRA **********\n" + << "********** Function: " << MF->getName() << '\n'); + + SmallSetVector<int, 8> TaggedSlots; + for (auto &BB : *MF) { + for (auto &I : BB) { + if (I.getOpcode() == AArch64::TAGPstack) { + ReTags.push_back(&I); + int FI = I.getOperand(1).getIndex(); + TaggedSlots.insert(FI); + // There should be no offsets in TAGP yet. + assert(I.getOperand(2).getImm() == 0); + } + } + } + + if (ReTags.empty()) + return false; + + if (mayUseUncheckedLoadStore()) + uncheckLoadsAndStores(); + + return true; +} diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp index 0e84a00df006..5deb601822b8 100644 --- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp +++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -151,7 +151,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { int64_t Offset; if (TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI) && BaseOp->isReg()) { - unsigned BaseReg = BaseOp->getReg(); + Register BaseReg = BaseOp->getReg(); if (PrevBaseReg == BaseReg) { // If this block can take STPs, skip ahead to the next block. 
if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent())) diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index 3bc89b91c3f7..558bea368eff 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -71,19 +71,22 @@ void AArch64Subtarget::initializeProperties() { case CortexA35: break; case CortexA53: - PrefFunctionAlignment = 3; + PrefFunctionLogAlignment = 3; break; case CortexA55: break; case CortexA57: MaxInterleaveFactor = 4; - PrefFunctionAlignment = 4; + PrefFunctionLogAlignment = 4; + break; + case CortexA65: + PrefFunctionLogAlignment = 3; break; case CortexA72: case CortexA73: case CortexA75: case CortexA76: - PrefFunctionAlignment = 4; + PrefFunctionLogAlignment = 4; break; case Cyclone: CacheLineSize = 64; @@ -94,14 +97,14 @@ void AArch64Subtarget::initializeProperties() { case ExynosM1: MaxInterleaveFactor = 4; MaxJumpTableSize = 8; - PrefFunctionAlignment = 4; - PrefLoopAlignment = 3; + PrefFunctionLogAlignment = 4; + PrefLoopLogAlignment = 3; break; case ExynosM3: MaxInterleaveFactor = 4; MaxJumpTableSize = 20; - PrefFunctionAlignment = 5; - PrefLoopAlignment = 4; + PrefFunctionLogAlignment = 5; + PrefLoopLogAlignment = 4; break; case Falkor: MaxInterleaveFactor = 4; @@ -122,6 +125,12 @@ void AArch64Subtarget::initializeProperties() { // FIXME: remove this to enable 64-bit SLP if performance looks good. MinVectorRegisterBitWidth = 128; break; + case NeoverseE1: + PrefFunctionLogAlignment = 3; + break; + case NeoverseN1: + PrefFunctionLogAlignment = 4; + break; case Saphira: MaxInterleaveFactor = 4; // FIXME: remove this to enable 64-bit SLP if performance looks good. @@ -129,8 +138,8 @@ void AArch64Subtarget::initializeProperties() { break; case ThunderX2T99: CacheLineSize = 64; - PrefFunctionAlignment = 3; - PrefLoopAlignment = 2; + PrefFunctionLogAlignment = 3; + PrefLoopLogAlignment = 2; MaxInterleaveFactor = 4; PrefetchDistance = 128; MinPrefetchStride = 1024; @@ -143,15 +152,15 @@ void AArch64Subtarget::initializeProperties() { case ThunderXT81: case ThunderXT83: CacheLineSize = 128; - PrefFunctionAlignment = 3; - PrefLoopAlignment = 2; + PrefFunctionLogAlignment = 3; + PrefLoopLogAlignment = 2; // FIXME: remove this to enable 64-bit SLP if performance looks good. MinVectorRegisterBitWidth = 128; break; case TSV110: CacheLineSize = 64; - PrefFunctionAlignment = 4; - PrefLoopAlignment = 2; + PrefFunctionLogAlignment = 4; + PrefLoopLogAlignment = 2; break; } } @@ -187,7 +196,7 @@ const CallLowering *AArch64Subtarget::getCallLowering() const { return CallLoweringInfo.get(); } -const InstructionSelector *AArch64Subtarget::getInstructionSelector() const { +InstructionSelector *AArch64Subtarget::getInstructionSelector() const { return InstSelector.get(); } @@ -201,7 +210,7 @@ const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const { /// Find the target operand flags that describe how a global value should be /// referenced for the current subtarget. -unsigned char +unsigned AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { // MachO large model always goes via a GOT, simply to get a single 8-byte @@ -224,10 +233,17 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, GV->hasExternalWeakLinkage()) return AArch64II::MO_GOT; + // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate + // that their nominal addresses are tagged and outside of the code model. 
In + // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the + // tag if necessary based on MO_TAGGED. + if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType())) + return AArch64II::MO_NC | AArch64II::MO_TAGGED; + return AArch64II::MO_NO_FLAG; } -unsigned char AArch64Subtarget::classifyGlobalFunctionReference( +unsigned AArch64Subtarget::classifyGlobalFunctionReference( const GlobalValue *GV, const TargetMachine &TM) const { // MachO large model always goes via a GOT, because we don't have the // relocations available to do anything else.. @@ -275,7 +291,7 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const { std::unique_ptr<PBQPRAConstraint> AArch64Subtarget::getCustomPBQPConstraints() const { - return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr; + return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr; } void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const { diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 0c84cfb8329a..f3212fae8e5e 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -42,6 +42,7 @@ public: CortexA53, CortexA55, CortexA57, + CortexA65, CortexA72, CortexA73, CortexA75, @@ -51,6 +52,8 @@ public: ExynosM3, Falkor, Kryo, + NeoverseE1, + NeoverseN1, Saphira, ThunderX2T99, ThunderX, @@ -113,6 +116,7 @@ protected: bool HasTRACEV8_4 = false; bool HasAM = false; bool HasSEL2 = false; + bool HasPMU = false; bool HasTLB_RMI = false; bool HasFMI = false; bool HasRCPC_IMMO = false; @@ -134,6 +138,7 @@ protected: bool HasBTI = false; bool HasRandGen = false; bool HasMTE = false; + bool HasTME = false; // Arm SVE2 extensions bool HasSVE2AES = false; @@ -141,6 +146,10 @@ protected: bool HasSVE2SHA3 = false; bool HasSVE2BitPerm = false; + // Future architecture extensions. + bool HasETE = false; + bool HasTRBE = false; + // HasZeroCycleRegMove - Has zero-cycle register mov instructions. 
bool HasZeroCycleRegMove = false; @@ -183,14 +192,15 @@ protected: bool UseEL1ForTP = false; bool UseEL2ForTP = false; bool UseEL3ForTP = false; + bool AllowTaggedGlobals = false; uint8_t MaxInterleaveFactor = 2; uint8_t VectorInsertExtractBaseCost = 3; uint16_t CacheLineSize = 0; uint16_t PrefetchDistance = 0; uint16_t MinPrefetchStride = 1; unsigned MaxPrefetchIterationsAhead = UINT_MAX; - unsigned PrefFunctionAlignment = 0; - unsigned PrefLoopAlignment = 0; + unsigned PrefFunctionLogAlignment = 0; + unsigned PrefLoopLogAlignment = 0; unsigned MaxJumpTableSize = 0; unsigned WideningBaseCost = 0; @@ -247,7 +257,7 @@ public: return &getInstrInfo()->getRegisterInfo(); } const CallLowering *getCallLowering() const override; - const InstructionSelector *getInstructionSelector() const override; + InstructionSelector *getInstructionSelector() const override; const LegalizerInfo *getLegalizerInfo() const override; const RegisterBankInfo *getRegBankInfo() const override; const Triple &getTargetTriple() const { return TargetTriple; } @@ -344,14 +354,16 @@ public: unsigned getVectorInsertExtractBaseCost() const { return VectorInsertExtractBaseCost; } - unsigned getCacheLineSize() const { return CacheLineSize; } - unsigned getPrefetchDistance() const { return PrefetchDistance; } - unsigned getMinPrefetchStride() const { return MinPrefetchStride; } - unsigned getMaxPrefetchIterationsAhead() const { + unsigned getCacheLineSize() const override { return CacheLineSize; } + unsigned getPrefetchDistance() const override { return PrefetchDistance; } + unsigned getMinPrefetchStride() const override { return MinPrefetchStride; } + unsigned getMaxPrefetchIterationsAhead() const override { return MaxPrefetchIterationsAhead; } - unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; } - unsigned getPrefLoopAlignment() const { return PrefLoopAlignment; } + unsigned getPrefFunctionLogAlignment() const { + return PrefFunctionLogAlignment; + } + unsigned getPrefLoopLogAlignment() const { return PrefLoopLogAlignment; } unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; } @@ -380,6 +392,7 @@ public: bool hasBTI() const { return HasBTI; } bool hasRandGen() const { return HasRandGen; } bool hasMTE() const { return HasMTE; } + bool hasTME() const { return HasTME; } // Arm SVE2 extensions bool hasSVE2AES() const { return HasSVE2AES; } bool hasSVE2SM4() const { return HasSVE2SM4; } @@ -399,6 +412,8 @@ public: bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } + bool isTargetILP32() const { return TargetTriple.isArch32Bit(); } + bool useAA() const override { return UseAA; } bool hasVH() const { return HasVH; } @@ -421,10 +436,17 @@ public: bool hasTRACEV8_4() const { return HasTRACEV8_4; } bool hasAM() const { return HasAM; } bool hasSEL2() const { return HasSEL2; } + bool hasPMU() const { return HasPMU; } bool hasTLB_RMI() const { return HasTLB_RMI; } bool hasFMI() const { return HasFMI; } bool hasRCPC_IMMO() const { return HasRCPC_IMMO; } + bool addrSinkUsingGEPs() const override { + // Keeping GEPs inbounds is important for exploiting AArch64 + // addressing-modes in ILP32 mode. 
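// (One plausible reading: with this hook returning true, address math is
//  sunk as a GEP such as
//    %a = getelementptr inbounds i8, i8* %base, i64 %off
//  next to its load or store, so instruction selection can fold it into an
//  addressing mode instead of re-materializing the add.)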
+ return useAA() || isTargetILP32(); + } + bool useSmallAddressing() const { switch (TLInfo.getTargetMachine().getCodeModel()) { case CodeModel::Kernel: @@ -443,11 +465,11 @@ public: /// ClassifyGlobalReference - Find the target operand flags that describe /// how a global value should be referenced for the current subtarget. - unsigned char ClassifyGlobalReference(const GlobalValue *GV, - const TargetMachine &TM) const; + unsigned ClassifyGlobalReference(const GlobalValue *GV, + const TargetMachine &TM) const; - unsigned char classifyGlobalFunctionReference(const GlobalValue *GV, - const TargetMachine &TM) const; + unsigned classifyGlobalFunctionReference(const GlobalValue *GV, + const TargetMachine &TM) const; void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override; diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td index 536a6591478b..05249a4ea6a8 100644 --- a/lib/Target/AArch64/AArch64SystemOperands.td +++ b/lib/Target/AArch64/AArch64SystemOperands.td @@ -612,6 +612,7 @@ def : ROSysReg<"ISR_EL1", 0b11, 0b000, 0b1100, 0b0001, 0b000>; def : ROSysReg<"CNTPCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b001>; def : ROSysReg<"CNTVCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b010>; def : ROSysReg<"ID_MMFR4_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b110>; +def : ROSysReg<"ID_MMFR5_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b110>; // Trace registers // Op0 Op1 CRn CRm Op2 @@ -1321,6 +1322,12 @@ def : RWSysReg<"CNTHPS_CTL_EL2", 0b11, 0b100, 0b1110, 0b0101, 0b001>; def : RWSysReg<"SDER32_EL2", 0b11, 0b100, 0b0001, 0b0011, 0b001>; } // FeatureSEL2 +// v8.4a PMU registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeaturePMU} }] in { +def : RWSysReg<"PMMIR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b110>; +} // FeaturePMU + // v8.4a RAS registers // Op0 Op1 CRn CRm Op2 let Requires = [{ {AArch64::FeatureRASv8_4} }] in { @@ -1452,14 +1459,37 @@ let Requires = [{ {AArch64::FeatureMTE} }] in { def : RWSysReg<"TCO", 0b11, 0b011, 0b0100, 0b0010, 0b111>; def : RWSysReg<"GCR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b110>; def : RWSysReg<"RGSR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b101>; -def : RWSysReg<"TFSR_EL1", 0b11, 0b000, 0b0110, 0b0101, 0b000>; -def : RWSysReg<"TFSR_EL2", 0b11, 0b100, 0b0110, 0b0101, 0b000>; -def : RWSysReg<"TFSR_EL3", 0b11, 0b110, 0b0110, 0b0110, 0b000>; -def : RWSysReg<"TFSR_EL12", 0b11, 0b101, 0b0110, 0b0110, 0b000>; -def : RWSysReg<"TFSRE0_EL1", 0b11, 0b000, 0b0110, 0b0110, 0b001>; +def : RWSysReg<"TFSR_EL1", 0b11, 0b000, 0b0101, 0b0110, 0b000>; +def : RWSysReg<"TFSR_EL2", 0b11, 0b100, 0b0101, 0b0110, 0b000>; +def : RWSysReg<"TFSR_EL3", 0b11, 0b110, 0b0101, 0b0110, 0b000>; +def : RWSysReg<"TFSR_EL12", 0b11, 0b101, 0b0101, 0b0110, 0b000>; +def : RWSysReg<"TFSRE0_EL1", 0b11, 0b000, 0b0101, 0b0110, 0b001>; def : ROSysReg<"GMID_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b100>; } // HasMTE +// Embedded Trace Extension R/W System registers +let Requires = [{ {AArch64::FeatureETE} }] in { +// Name Op0 Op1 CRn CRm Op2 +def : RWSysReg<"TRCRSR", 0b10, 0b001, 0b0000, 0b1010, 0b000>; +// TRCEXTINSELR0 has the same encoding as ETM TRCEXTINSELR +def : RWSysReg<"TRCEXTINSELR0", 0b10, 0b001, 0b0000, 0b1000, 0b100>; +def : RWSysReg<"TRCEXTINSELR1", 0b10, 0b001, 0b0000, 0b1001, 0b100>; +def : RWSysReg<"TRCEXTINSELR2", 0b10, 0b001, 0b0000, 0b1010, 0b100>; +def : RWSysReg<"TRCEXTINSELR3", 0b10, 0b001, 0b0000, 0b1011, 0b100>; +} // FeatureETE + +// Trace Buffer Extension System registers +let Requires = [{ {AArch64::FeatureTRBE} }] in { +// Name 
Op0 Op1 CRn CRm Op2 +def : RWSysReg<"TRBLIMITR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b000>; +def : RWSysReg<"TRBPTR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b001>; +def : RWSysReg<"TRBBASER_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b010>; +def : RWSysReg<"TRBSR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b011>; +def : RWSysReg<"TRBMAR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b100>; +def : RWSysReg<"TRBTRG_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b110>; +def : ROSysReg<"TRBIDR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b111>; +} // FeatureTRBE + // Cyclone specific system registers // Op0 Op1 CRn CRm Op2 let Requires = [{ {AArch64::ProcCyclone} }] in diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 865461480499..b3ed96e815be 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -157,6 +157,8 @@ extern "C" void LLVMInitializeAArch64Target() { RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget()); RegisterTargetMachine<AArch64beTargetMachine> Y(getTheAArch64beTarget()); RegisterTargetMachine<AArch64leTargetMachine> Z(getTheARM64Target()); + RegisterTargetMachine<AArch64leTargetMachine> W(getTheARM64_32Target()); + RegisterTargetMachine<AArch64leTargetMachine> V(getTheAArch64_32Target()); auto PR = PassRegistry::getPassRegistry(); initializeGlobalISel(*PR); initializeAArch64A53Fix835769Pass(*PR); @@ -180,6 +182,7 @@ extern "C" void LLVMInitializeAArch64Target() { initializeLDTLSCleanupPass(*PR); initializeAArch64SpeculationHardeningPass(*PR); initializeAArch64StackTaggingPass(*PR); + initializeAArch64StackTaggingPreRAPass(*PR); } //===----------------------------------------------------------------------===// @@ -187,11 +190,11 @@ extern "C" void LLVMInitializeAArch64Target() { //===----------------------------------------------------------------------===// static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { if (TT.isOSBinFormatMachO()) - return llvm::make_unique<AArch64_MachoTargetObjectFile>(); + return std::make_unique<AArch64_MachoTargetObjectFile>(); if (TT.isOSBinFormatCOFF()) - return llvm::make_unique<AArch64_COFFTargetObjectFile>(); + return std::make_unique<AArch64_COFFTargetObjectFile>(); - return llvm::make_unique<AArch64_ELFTargetObjectFile>(); + return std::make_unique<AArch64_ELFTargetObjectFile>(); } // Helper function to build a DataLayout string @@ -200,8 +203,11 @@ static std::string computeDataLayout(const Triple &TT, bool LittleEndian) { if (Options.getABIName() == "ilp32") return "e-m:e-p:32:32-i8:8-i16:16-i64:64-S128"; - if (TT.isOSBinFormatMachO()) + if (TT.isOSBinFormatMachO()) { + if (TT.getArch() == Triple::aarch64_32) + return "e-m:o-p:32:32-i64:64-i128:128-n32:64-S128"; return "e-m:o-i64:64-i128:128-n32:64-S128"; + } if (TT.isOSBinFormatCOFF()) return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"; if (LittleEndian) @@ -277,8 +283,11 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, this->Options.TrapUnreachable = true; } - // Enable GlobalISel at or below EnableGlobalISelAt0. - if (getOptLevel() <= EnableGlobalISelAtO) { + // Enable GlobalISel at or below EnableGlobalISelAt0, unless this is + // MachO/CodeModel::Large, which GlobalISel does not support. 
+ if (getOptLevel() <= EnableGlobalISelAtO && + TT.getArch() != Triple::aarch64_32 && + !(getCodeModel() == CodeModel::Large && TT.isOSBinFormatMachO())) { setGlobalISel(true); setGlobalISelAbort(GlobalISelAbortMode::Disable); } @@ -310,7 +319,7 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. resetTargetOptions(F); - I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this, + I = std::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this, isLittle); } return I.get(); @@ -448,7 +457,8 @@ void AArch64PassConfig::addIRPasses() { addPass(createLICMPass()); } - addPass(createAArch64StackTaggingPass()); + addPass(createAArch64StackTaggingPass(/* MergeInit = */ TM->getOptLevel() != + CodeGenOpt::None)); } // Pass Pipeline Configuration @@ -502,7 +512,8 @@ bool AArch64PassConfig::addIRTranslator() { } void AArch64PassConfig::addPreLegalizeMachineIR() { - addPass(createAArch64PreLegalizeCombiner()); + bool IsOptNone = getOptLevel() == CodeGenOpt::None; + addPass(createAArch64PreLegalizeCombiner(IsOptNone)); } bool AArch64PassConfig::addLegalizeMachineIR() { @@ -516,9 +527,7 @@ bool AArch64PassConfig::addRegBankSelect() { } void AArch64PassConfig::addPreGlobalInstructionSelect() { - // Workaround the deficiency of the fast register allocator. - if (TM->getOptLevel() == CodeGenOpt::None) - addPass(new Localizer()); + addPass(new Localizer()); } bool AArch64PassConfig::addGlobalInstructionSelect() { @@ -540,6 +549,8 @@ bool AArch64PassConfig::addILPOpts() { if (EnableStPairSuppress) addPass(createAArch64StorePairSuppressPass()); addPass(createAArch64SIMDInstrOptPass()); + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createAArch64StackTaggingPreRAPass()); return true; } diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp index 1c3d5d0743ad..54562094fcf5 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp +++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -59,8 +59,8 @@ MCSymbol *AArch64_MachoTargetObjectFile::getCFIPersonalitySymbol( } const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel( - const MCSymbol *Sym, const MCValue &MV, int64_t Offset, - MachineModuleInfo *MMI, MCStreamer &Streamer) const { + const GlobalValue *GV, const MCSymbol *Sym, const MCValue &MV, + int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const { assert((Offset+MV.getConstant() == 0) && "Arch64 does not support GOT PC rel with extra offset"); // On ARM64 Darwin, we can reference symbols with foo@GOT-., which diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h index 7ead363d42fe..1cb4c028c80d 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.h +++ b/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -35,7 +35,8 @@ public: const TargetMachine &TM, MachineModuleInfo *MMI) const override; - const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym, + const MCExpr *getIndirectSymViaGOTPCRel(const GlobalValue *GV, + const MCSymbol *Sym, const MCValue &MV, int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const override; diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index a4b78f2a7d6b..dc916a7b3407 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -618,6 +618,19 @@ int 
AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } +AArch64TTIImpl::TTI::MemCmpExpansionOptions +AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { + TTI::MemCmpExpansionOptions Options; + Options.AllowOverlappingLoads = !ST->requiresStrictAlign(); + Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); + Options.NumLoadsPerBlock = Options.MaxNumLoads; + // TODO: Though vector loads usually perform well on AArch64, in some targets + // they may wake up the FP unit, which raises the power consumption. Perhaps + // they could be used with no holds barred (-O3). + Options.LoadSizes = {8, 4, 2, 1}; + return Options; +} + int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, unsigned Alignment, unsigned AddressSpace, const Instruction *I) { @@ -879,22 +892,6 @@ bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( return Considerable; } -unsigned AArch64TTIImpl::getCacheLineSize() { - return ST->getCacheLineSize(); -} - -unsigned AArch64TTIImpl::getPrefetchDistance() { - return ST->getPrefetchDistance(); -} - -unsigned AArch64TTIImpl::getMinPrefetchStride() { - return ST->getMinPrefetchStride(); -} - -unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() { - return ST->getMaxPrefetchIterationsAhead(); -} - bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type"); diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index 10c15a139b4c..32c59f41e1c3 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -85,7 +85,8 @@ public: bool enableInterleavedAccessVectorization() { return true; } - unsigned getNumberOfRegisters(bool Vector) { + unsigned getNumberOfRegisters(unsigned ClassID) const { + bool Vector = (ClassID == 1); if (Vector) { if (ST->hasNEON()) return 32; @@ -130,6 +131,9 @@ public: int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I = nullptr); + TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, + bool IsZeroCmp) const; + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I = nullptr); @@ -153,14 +157,6 @@ public: shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader); - unsigned getCacheLineSize(); - - unsigned getPrefetchDistance(); - - unsigned getMinPrefetchStride(); - - unsigned getMaxPrefetchIterationsAhead(); - bool shouldExpandReduction(const IntrinsicInst *II) const { return false; } diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index f4c55d48d215..4fb409f020d9 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -935,48 +935,34 @@ public: return false; } - bool isMovZSymbolG3() const { - return isMovWSymbol(AArch64MCExpr::VK_ABS_G3); + bool isMovWSymbolG3() const { + return isMovWSymbol({AArch64MCExpr::VK_ABS_G3, AArch64MCExpr::VK_PREL_G3}); } - bool isMovZSymbolG2() const { - return isMovWSymbol({AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S, - AArch64MCExpr::VK_TPREL_G2, - AArch64MCExpr::VK_DTPREL_G2}); - } - - bool isMovZSymbolG1() const { - return isMovWSymbol({ - AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S, - 
AArch64MCExpr::VK_GOTTPREL_G1, AArch64MCExpr::VK_TPREL_G1, - AArch64MCExpr::VK_DTPREL_G1, - }); - } - - bool isMovZSymbolG0() const { - return isMovWSymbol({AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S, - AArch64MCExpr::VK_TPREL_G0, - AArch64MCExpr::VK_DTPREL_G0}); - } - - bool isMovKSymbolG3() const { - return isMovWSymbol(AArch64MCExpr::VK_ABS_G3); - } - - bool isMovKSymbolG2() const { - return isMovWSymbol(AArch64MCExpr::VK_ABS_G2_NC); + bool isMovWSymbolG2() const { + return isMovWSymbol( + {AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S, + AArch64MCExpr::VK_ABS_G2_NC, AArch64MCExpr::VK_PREL_G2, + AArch64MCExpr::VK_PREL_G2_NC, AArch64MCExpr::VK_TPREL_G2, + AArch64MCExpr::VK_DTPREL_G2}); } - bool isMovKSymbolG1() const { - return isMovWSymbol({AArch64MCExpr::VK_ABS_G1_NC, - AArch64MCExpr::VK_TPREL_G1_NC, - AArch64MCExpr::VK_DTPREL_G1_NC}); + bool isMovWSymbolG1() const { + return isMovWSymbol( + {AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S, + AArch64MCExpr::VK_ABS_G1_NC, AArch64MCExpr::VK_PREL_G1, + AArch64MCExpr::VK_PREL_G1_NC, AArch64MCExpr::VK_GOTTPREL_G1, + AArch64MCExpr::VK_TPREL_G1, AArch64MCExpr::VK_TPREL_G1_NC, + AArch64MCExpr::VK_DTPREL_G1, AArch64MCExpr::VK_DTPREL_G1_NC}); } - bool isMovKSymbolG0() const { + bool isMovWSymbolG0() const { return isMovWSymbol( - {AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC, - AArch64MCExpr::VK_TPREL_G0_NC, AArch64MCExpr::VK_DTPREL_G0_NC}); + {AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S, + AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_PREL_G0, + AArch64MCExpr::VK_PREL_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC, + AArch64MCExpr::VK_TPREL_G0, AArch64MCExpr::VK_TPREL_G0_NC, + AArch64MCExpr::VK_DTPREL_G0, AArch64MCExpr::VK_DTPREL_G0_NC}); } template<int RegWidth, int Shift> @@ -1814,7 +1800,7 @@ public: static std::unique_ptr<AArch64Operand> CreateToken(StringRef Str, bool IsSuffix, SMLoc S, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_Token, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_Token, Ctx); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->Tok.IsSuffix = IsSuffix; @@ -1829,7 +1815,7 @@ public: AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL, unsigned ShiftAmount = 0, unsigned HasExplicitAmount = false) { - auto Op = make_unique<AArch64Operand>(k_Register, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_Register, Ctx); Op->Reg.RegNum = RegNum; Op->Reg.Kind = Kind; Op->Reg.ElementWidth = 0; @@ -1861,7 +1847,7 @@ public: CreateVectorList(unsigned RegNum, unsigned Count, unsigned NumElements, unsigned ElementWidth, RegKind RegisterKind, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_VectorList, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_VectorList, Ctx); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; Op->VectorList.NumElements = NumElements; @@ -1874,7 +1860,7 @@ public: static std::unique_ptr<AArch64Operand> CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_VectorIndex, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_VectorIndex, Ctx); Op->VectorIndex.Val = Idx; Op->StartLoc = S; Op->EndLoc = E; @@ -1883,7 +1869,7 @@ public: static std::unique_ptr<AArch64Operand> CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_Immediate, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_Immediate, Ctx); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -1894,7 +1880,7 @@ public: unsigned 
ShiftAmount, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_ShiftedImm, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_ShiftedImm, Ctx); Op->ShiftedImm .Val = Val; Op->ShiftedImm.ShiftAmount = ShiftAmount; Op->StartLoc = S; @@ -1904,7 +1890,7 @@ public: static std::unique_ptr<AArch64Operand> CreateCondCode(AArch64CC::CondCode Code, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_CondCode, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_CondCode, Ctx); Op->CondCode.Code = Code; Op->StartLoc = S; Op->EndLoc = E; @@ -1913,7 +1899,7 @@ public: static std::unique_ptr<AArch64Operand> CreateFPImm(APFloat Val, bool IsExact, SMLoc S, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_FPImm, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_FPImm, Ctx); Op->FPImm.Val = Val.bitcastToAPInt().getSExtValue(); Op->FPImm.IsExact = IsExact; Op->StartLoc = S; @@ -1925,7 +1911,7 @@ public: StringRef Str, SMLoc S, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_Barrier, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_Barrier, Ctx); Op->Barrier.Val = Val; Op->Barrier.Data = Str.data(); Op->Barrier.Length = Str.size(); @@ -1939,7 +1925,7 @@ public: uint32_t MSRReg, uint32_t PStateField, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_SysReg, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_SysReg, Ctx); Op->SysReg.Data = Str.data(); Op->SysReg.Length = Str.size(); Op->SysReg.MRSReg = MRSReg; @@ -1952,7 +1938,7 @@ public: static std::unique_ptr<AArch64Operand> CreateSysCR(unsigned Val, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_SysCR, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_SysCR, Ctx); Op->SysCRImm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -1963,7 +1949,7 @@ public: StringRef Str, SMLoc S, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_Prefetch, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_Prefetch, Ctx); Op->Prefetch.Val = Val; Op->Barrier.Data = Str.data(); Op->Barrier.Length = Str.size(); @@ -1976,7 +1962,7 @@ public: StringRef Str, SMLoc S, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_PSBHint, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_PSBHint, Ctx); Op->PSBHint.Val = Val; Op->PSBHint.Data = Str.data(); Op->PSBHint.Length = Str.size(); @@ -1989,7 +1975,7 @@ public: StringRef Str, SMLoc S, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_BTIHint, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_BTIHint, Ctx); Op->BTIHint.Val = Val << 1 | 32; Op->BTIHint.Data = Str.data(); Op->BTIHint.Length = Str.size(); @@ -2001,7 +1987,7 @@ public: static std::unique_ptr<AArch64Operand> CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val, bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_ShiftExtend, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_ShiftExtend, Ctx); Op->ShiftExtend.Type = ShOp; Op->ShiftExtend.Amount = Val; Op->ShiftExtend.HasExplicitAmount = HasExplicitAmount; @@ -2840,7 +2826,7 @@ static const struct Extension { {"sve2-aes", {AArch64::FeatureSVE2AES}}, {"sve2-sm4", {AArch64::FeatureSVE2SM4}}, {"sve2-sha3", {AArch64::FeatureSVE2SHA3}}, - {"bitperm", {AArch64::FeatureSVE2BitPerm}}, + {"sve2-bitperm", {AArch64::FeatureSVE2BitPerm}}, // FIXME: Unsupported extensions {"pan", {}}, {"lor", {}}, @@ -3260,6 +3246,13 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) { .Case("abs_g0", 
AArch64MCExpr::VK_ABS_G0) .Case("abs_g0_s", AArch64MCExpr::VK_ABS_G0_S) .Case("abs_g0_nc", AArch64MCExpr::VK_ABS_G0_NC) + .Case("prel_g3", AArch64MCExpr::VK_PREL_G3) + .Case("prel_g2", AArch64MCExpr::VK_PREL_G2) + .Case("prel_g2_nc", AArch64MCExpr::VK_PREL_G2_NC) + .Case("prel_g1", AArch64MCExpr::VK_PREL_G1) + .Case("prel_g1_nc", AArch64MCExpr::VK_PREL_G1_NC) + .Case("prel_g0", AArch64MCExpr::VK_PREL_G0) + .Case("prel_g0_nc", AArch64MCExpr::VK_PREL_G0_NC) .Case("dtprel_g2", AArch64MCExpr::VK_DTPREL_G2) .Case("dtprel_g1", AArch64MCExpr::VK_DTPREL_G1) .Case("dtprel_g1_nc", AArch64MCExpr::VK_DTPREL_G1_NC) @@ -5283,7 +5276,7 @@ bool AArch64AsmParser::parseDirectiveInst(SMLoc Loc) { auto parseOp = [&]() -> bool { SMLoc L = getLoc(); - const MCExpr *Expr; + const MCExpr *Expr = nullptr; if (check(getParser().parseExpression(Expr), L, "expected expression")) return true; const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr); @@ -5542,43 +5535,43 @@ unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, switch (Kind) { default: return Match_InvalidOperand; - case MCK__35_0: + case MCK__HASH_0: ExpectedVal = 0; break; - case MCK__35_1: + case MCK__HASH_1: ExpectedVal = 1; break; - case MCK__35_12: + case MCK__HASH_12: ExpectedVal = 12; break; - case MCK__35_16: + case MCK__HASH_16: ExpectedVal = 16; break; - case MCK__35_2: + case MCK__HASH_2: ExpectedVal = 2; break; - case MCK__35_24: + case MCK__HASH_24: ExpectedVal = 24; break; - case MCK__35_3: + case MCK__HASH_3: ExpectedVal = 3; break; - case MCK__35_32: + case MCK__HASH_32: ExpectedVal = 32; break; - case MCK__35_4: + case MCK__HASH_4: ExpectedVal = 4; break; - case MCK__35_48: + case MCK__HASH_48: ExpectedVal = 48; break; - case MCK__35_6: + case MCK__HASH_6: ExpectedVal = 6; break; - case MCK__35_64: + case MCK__HASH_64: ExpectedVal = 64; break; - case MCK__35_8: + case MCK__HASH_8: ExpectedVal = 8; break; } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 6418211a4f55..21ce5785ea5e 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -153,9 +153,8 @@ static unsigned AdrImmBits(unsigned Value) { static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, uint64_t Value, MCContext &Ctx, const Triple &TheTriple, bool IsResolved) { - unsigned Kind = Fixup.getKind(); int64_t SignedValue = static_cast<int64_t>(Value); - switch (Kind) { + switch (Fixup.getTargetKind()) { default: llvm_unreachable("Unknown fixup kind!"); case AArch64::fixup_aarch64_pcrel_adr_imm21: @@ -574,7 +573,7 @@ public: case MCCFIInstruction::OpDefCfa: { // Defines a frame pointer. unsigned XReg = - getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)); + getXRegFromWReg(*MRI.getLLVMRegNum(Inst.getRegister(), true)); // Other CFA registers than FP are not supported by compact unwind. // Fallback on DWARF. @@ -593,8 +592,8 @@ public: assert(FPPush.getOperation() == MCCFIInstruction::OpOffset && "Frame pointer not pushed!"); - unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true); - unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true); + unsigned LRReg = *MRI.getLLVMRegNum(LRPush.getRegister(), true); + unsigned FPReg = *MRI.getLLVMRegNum(FPPush.getRegister(), true); LRReg = getXRegFromWReg(LRReg); FPReg = getXRegFromWReg(FPReg); @@ -615,14 +614,14 @@ public: case MCCFIInstruction::OpOffset: { // Registers are saved in pairs. 
We expect there to be two consecutive // `.cfi_offset' instructions with the appropriate registers specified. - unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true); + unsigned Reg1 = *MRI.getLLVMRegNum(Inst.getRegister(), true); if (i + 1 == e) return CU::UNWIND_ARM64_MODE_DWARF; const MCCFIInstruction &Inst2 = Instrs[++i]; if (Inst2.getOperation() != MCCFIInstruction::OpOffset) return CU::UNWIND_ARM64_MODE_DWARF; - unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true); + unsigned Reg2 = *MRI.getLLVMRegNum(Inst2.getRegister(), true); // N.B. The encodings must be in register number order, and the X // registers before the D registers. diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index c871e2c62eac..0fd1ca187be7 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -57,7 +57,7 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32) static bool isNonILP32reloc(const MCFixup &Fixup, AArch64MCExpr::VariantKind RefKind, MCContext &Ctx) { - if ((unsigned)Fixup.getKind() != AArch64::fixup_aarch64_movw) + if (Fixup.getTargetKind() != AArch64::fixup_aarch64_movw) return false; switch (RefKind) { case AArch64MCExpr::VK_ABS_G3: @@ -120,7 +120,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, "Should only be expression-level modifiers here"); if (IsPCRel) { - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { case FK_Data_1: Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported"); return ELF::R_AARCH64_NONE; @@ -184,7 +184,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, } else { if (IsILP32 && isNonILP32reloc(Fixup, RefKind, Ctx)) return ELF::R_AARCH64_NONE; - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { case FK_NONE: return ELF::R_AARCH64_NONE; case FK_Data_1: @@ -394,6 +394,20 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return R_CLS(MOVW_SABS_G0); if (RefKind == AArch64MCExpr::VK_ABS_G0_NC) return R_CLS(MOVW_UABS_G0_NC); + if (RefKind == AArch64MCExpr::VK_PREL_G3) + return ELF::R_AARCH64_MOVW_PREL_G3; + if (RefKind == AArch64MCExpr::VK_PREL_G2) + return ELF::R_AARCH64_MOVW_PREL_G2; + if (RefKind == AArch64MCExpr::VK_PREL_G2_NC) + return ELF::R_AARCH64_MOVW_PREL_G2_NC; + if (RefKind == AArch64MCExpr::VK_PREL_G1) + return R_CLS(MOVW_PREL_G1); + if (RefKind == AArch64MCExpr::VK_PREL_G1_NC) + return ELF::R_AARCH64_MOVW_PREL_G1_NC; + if (RefKind == AArch64MCExpr::VK_PREL_G0) + return R_CLS(MOVW_PREL_G0); + if (RefKind == AArch64MCExpr::VK_PREL_G0_NC) + return R_CLS(MOVW_PREL_G0_NC); if (RefKind == AArch64MCExpr::VK_DTPREL_G2) return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2; if (RefKind == AArch64MCExpr::VK_DTPREL_G1) @@ -434,5 +448,5 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, std::unique_ptr<MCObjectTargetWriter> llvm::createAArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32) { - return llvm::make_unique<AArch64ELFObjectWriter>(OSABI, IsILP32); + return std::make_unique<AArch64ELFObjectWriter>(OSABI, IsILP32); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index d0a544273b8b..1a16468484ad 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -172,7 +172,8 @@ void AArch64InstPrinter::printInst(const MCInst 
*MI, raw_ostream &O,
     int ImmS = MI->getOperand(4).getImm();
 
     if ((Op2.getReg() == AArch64::WZR || Op2.getReg() == AArch64::XZR) &&
-        (ImmR == 0 || ImmS < ImmR)) {
+        (ImmR == 0 || ImmS < ImmR) &&
+        STI.getFeatureBits()[AArch64::HasV8_2aOps]) {
       // BFC takes precedence over its entire range, slightly differently to BFI.
       int BitWidth = Opcode == AArch64::BFMXri ? 64 : 32;
       int LSB = (BitWidth - ImmR) % BitWidth;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index ecff1ab0a8b3..5926a4f81616 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -30,7 +30,7 @@ static cl::opt<AsmWriterVariantTy> AsmWriterVariant(
     cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"),
                clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly")));
 
-AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
+AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin(bool IsILP32) {
   // We prefer NEON instructions to be printed in the short, Apple-specific
   // form when targeting Darwin.
   AssemblerDialect = AsmWriterVariant == Default ? Apple : AsmWriterVariant;
@@ -39,7 +39,8 @@ AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
   PrivateLabelPrefix = "L";
   SeparatorString = "%%";
   CommentString = ";";
-  CodePointerSize = CalleeSaveStackSlotSize = 8;
+  CalleeSaveStackSlotSize = 8;
+  CodePointerSize = IsILP32 ? 4 : 8;
   AlignmentIsInBytes = false;
   UsesELFSectionDirectiveForBSS = true;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 36ae92afc8c1..7274ae79f74a 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -23,7 +23,7 @@ class Target;
 class Triple;
 
 struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin {
-  explicit AArch64MCAsmInfoDarwin();
+  explicit AArch64MCAsmInfoDarwin(bool IsILP32);
   const MCExpr *
   getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
                               MCStreamer &Streamer) const override;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 0a529321edc8..548e399e05a3 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -42,6 +42,13 @@ StringRef AArch64MCExpr::getVariantKindName() const {
   case VK_ABS_G0: return ":abs_g0:";
   case VK_ABS_G0_S: return ":abs_g0_s:";
   case VK_ABS_G0_NC: return ":abs_g0_nc:";
+  case VK_PREL_G3: return ":prel_g3:";
+  case VK_PREL_G2: return ":prel_g2:";
+  case VK_PREL_G2_NC: return ":prel_g2_nc:";
+  case VK_PREL_G1: return ":prel_g1:";
+  case VK_PREL_G1_NC: return ":prel_g1_nc:";
+  case VK_PREL_G0: return ":prel_g0:";
+  case VK_PREL_G0_NC: return ":prel_g0_nc:";
   case VK_DTPREL_G2: return ":dtprel_g2:";
   case VK_DTPREL_G1: return ":dtprel_g1:";
   case VK_DTPREL_G1_NC: return ":dtprel_g1_nc:";
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index ec9c95911628..a82ff2e91426 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -27,12 +27,13 @@ public:
   // symbol. E.g. direct, via the GOT, ...
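// (Each composite kind below is simply the OR of its parts, e.g.
//  VK_PREL_G1_NC == VK_PREL | VK_G1 | VK_NC: a PC-relative reference to
//  bits 16-31 of the address, with no overflow check.)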
VK_ABS = 0x001, VK_SABS = 0x002, - VK_GOT = 0x003, - VK_DTPREL = 0x004, - VK_GOTTPREL = 0x005, - VK_TPREL = 0x006, - VK_TLSDESC = 0x007, - VK_SECREL = 0x008, + VK_PREL = 0x003, + VK_GOT = 0x004, + VK_DTPREL = 0x005, + VK_GOTTPREL = 0x006, + VK_TPREL = 0x007, + VK_TLSDESC = 0x008, + VK_SECREL = 0x009, VK_SymLocBits = 0x00f, // Variants specifying which part of the final address calculation is @@ -72,6 +73,13 @@ public: VK_ABS_G0_S = VK_SABS | VK_G0, VK_ABS_G0_NC = VK_ABS | VK_G0 | VK_NC, VK_LO12 = VK_ABS | VK_PAGEOFF | VK_NC, + VK_PREL_G3 = VK_PREL | VK_G3, + VK_PREL_G2 = VK_PREL | VK_G2, + VK_PREL_G2_NC = VK_PREL | VK_G2 | VK_NC, + VK_PREL_G1 = VK_PREL | VK_G1, + VK_PREL_G1_NC = VK_PREL | VK_G1 | VK_NC, + VK_PREL_G0 = VK_PREL | VK_G0, + VK_PREL_G0_NC = VK_PREL | VK_G0 | VK_NC, VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC, VK_GOT_PAGE = VK_GOT | VK_PAGE, VK_DTPREL_G2 = VK_DTPREL | VK_G2, diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index df12274d9470..1d583ec0087b 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -241,7 +241,7 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, const Triple &TheTriple) { MCAsmInfo *MAI; if (TheTriple.isOSBinFormatMachO()) - MAI = new AArch64MCAsmInfoDarwin(); + MAI = new AArch64MCAsmInfoDarwin(TheTriple.getArch() == Triple::aarch64_32); else if (TheTriple.isWindowsMSVCEnvironment()) MAI = new AArch64MCAsmInfoMicrosoftCOFF(); else if (TheTriple.isOSBinFormatCOFF()) diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index b3ce5ef22eef..fc04d37eb362 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -54,7 +54,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED); Log2Size = ~0U; - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: return false; @@ -406,6 +406,6 @@ void AArch64MachObjectWriter::recordRelocation( std::unique_ptr<MCObjectTargetWriter> llvm::createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype, bool IsILP32) { - return llvm::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype, + return std::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype, IsILP32); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp index a45880a07427..aa50bd05cb71 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -120,7 +120,7 @@ bool AArch64WinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const { namespace llvm { std::unique_ptr<MCObjectTargetWriter> createAArch64WinCOFFObjectWriter() { - return llvm::make_unique<AArch64WinCOFFObjectWriter>(); + return std::make_unique<AArch64WinCOFFObjectWriter>(); } } // end namespace llvm diff --git a/lib/Target/AArch64/SVEInstrFormats.td b/lib/Target/AArch64/SVEInstrFormats.td index 808e59467081..8ccf6aa675ba 100644 --- a/lib/Target/AArch64/SVEInstrFormats.td +++ b/lib/Target/AArch64/SVEInstrFormats.td @@ -279,6 +279,19 @@ let Predicates = [HasSVE] in { defm PTRUES : sve_int_ptrue<0b001, "ptrues">; } 
+//===----------------------------------------------------------------------===// +// SVE pattern match helpers. +//===----------------------------------------------------------------------===// + +class SVE_1_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1, + Instruction inst> +: Pat<(vtd (op vt1:$Op1)), + (inst $Op1)>; + +class SVE_3_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1, + ValueType vt2, ValueType vt3, Instruction inst> +: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3)), + (inst $Op1, $Op2, $Op3)>; //===----------------------------------------------------------------------===// // SVE Predicate Misc Group @@ -403,12 +416,12 @@ multiclass sve_int_count_r_x64<bits<5> opc, string asm> { } class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm, - ZPRRegOp zprty> -: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, PPRAny:$Pg), - asm, "\t$Zdn, $Pg", + ZPRRegOp zprty, PPRRegOp pprty> +: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, pprty:$Pm), + asm, "\t$Zdn, $Pm", "", []>, Sched<[]> { - bits<4> Pg; + bits<4> Pm; bits<5> Zdn; let Inst{31-24} = 0b00100101; let Inst{23-22} = sz8_64; @@ -416,7 +429,7 @@ class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm, let Inst{18-16} = opc{4-2}; let Inst{15-11} = 0b10000; let Inst{10-9} = opc{1-0}; - let Inst{8-5} = Pg; + let Inst{8-5} = Pm; let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; @@ -425,9 +438,16 @@ class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm, } multiclass sve_int_count_v<bits<5> opc, string asm> { - def _H : sve_int_count_v<0b01, opc, asm, ZPR16>; - def _S : sve_int_count_v<0b10, opc, asm, ZPR32>; - def _D : sve_int_count_v<0b11, opc, asm, ZPR64>; + def _H : sve_int_count_v<0b01, opc, asm, ZPR16, PPR16>; + def _S : sve_int_count_v<0b10, opc, asm, ZPR32, PPR32>; + def _D : sve_int_count_v<0b11, opc, asm, ZPR64, PPR64>; + + def : InstAlias<asm # "\t$Zdn, $Pm", + (!cast<Instruction>(NAME # "_H") ZPR16:$Zdn, PPRAny:$Pm), 0>; + def : InstAlias<asm # "\t$Zdn, $Pm", + (!cast<Instruction>(NAME # "_S") ZPR32:$Zdn, PPRAny:$Pm), 0>; + def : InstAlias<asm # "\t$Zdn, $Pm", + (!cast<Instruction>(NAME # "_D") ZPR64:$Zdn, PPRAny:$Pm), 0>; } class sve_int_pcount_pred<bits<2> sz8_64, bits<4> opc, string asm, @@ -609,11 +629,12 @@ multiclass sve_int_pred_pattern_b_x64<bits<5> opc, string asm> { //===----------------------------------------------------------------------===// class sve_int_perm_dup_r<bits<2> sz8_64, string asm, ZPRRegOp zprty, - RegisterClass srcRegType> + ValueType vt, RegisterClass srcRegType, + SDPatternOperator op> : I<(outs zprty:$Zd), (ins srcRegType:$Rn), asm, "\t$Zd, $Rn", "", - []>, Sched<[]> { + [(set (vt zprty:$Zd), (op srcRegType:$Rn))]>, Sched<[]> { bits<5> Rn; bits<5> Zd; let Inst{31-24} = 0b00000101; @@ -623,11 +644,11 @@ class sve_int_perm_dup_r<bits<2> sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; } -multiclass sve_int_perm_dup_r<string asm> { - def _B : sve_int_perm_dup_r<0b00, asm, ZPR8, GPR32sp>; - def _H : sve_int_perm_dup_r<0b01, asm, ZPR16, GPR32sp>; - def _S : sve_int_perm_dup_r<0b10, asm, ZPR32, GPR32sp>; - def _D : sve_int_perm_dup_r<0b11, asm, ZPR64, GPR64sp>; +multiclass sve_int_perm_dup_r<string asm, SDPatternOperator op> { + def _B : sve_int_perm_dup_r<0b00, asm, ZPR8, nxv16i8, GPR32sp, op>; + def _H : sve_int_perm_dup_r<0b01, asm, ZPR16, nxv8i16, GPR32sp, op>; + def _S : sve_int_perm_dup_r<0b10, asm, ZPR32, nxv4i32, GPR32sp, op>; + def _D : sve_int_perm_dup_r<0b11, asm, ZPR64, nxv2i64, GPR64sp, op>; def : InstAlias<"mov $Zd, $Rn", (!cast<Instruction>(NAME # _B) 
ZPR8:$Zd, GPR32sp:$Rn), 1>; @@ -744,7 +765,7 @@ multiclass sve2_int_perm_tbl<string asm> { } class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty> -: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm), +: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, zprty:$Zm), asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { @@ -758,6 +779,8 @@ class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty> let Inst{15-10} = 0b001011; let Inst{9-5} = Zn; let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; } multiclass sve2_int_perm_tbx<string asm> { @@ -826,10 +849,14 @@ class sve_int_perm_unpk<bits<2> sz16_64, bits<2> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve_int_perm_unpk<bits<2> opc, string asm> { +multiclass sve_int_perm_unpk<bits<2> opc, string asm, SDPatternOperator op> { def _H : sve_int_perm_unpk<0b01, opc, asm, ZPR16, ZPR8>; def _S : sve_int_perm_unpk<0b10, opc, asm, ZPR32, ZPR16>; def _D : sve_int_perm_unpk<0b11, opc, asm, ZPR64, ZPR32>; + + def : SVE_1_Op_Pat<nxv8i16, op, nxv16i8, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Pat<nxv4i32, op, nxv8i16, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Pat<nxv2i64, op, nxv4i32, !cast<Instruction>(NAME # _D)>; } class sve_int_perm_insrs<bits<2> sz8_64, string asm, ZPRRegOp zprty, @@ -1197,10 +1224,12 @@ multiclass sve_fp_ftmad<string asm> { //===----------------------------------------------------------------------===// class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm, - ZPRRegOp zprty> + ZPRRegOp zprty, + ValueType vt, ValueType vt2, SDPatternOperator op> : I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm), asm, "\t$Zd, $Zn, $Zm", - "", []>, Sched<[]> { + "", + [(set (vt zprty:$Zd), (op (vt zprty:$Zn), (vt2 zprty:$Zm)))]>, Sched<[]> { bits<5> Zd; bits<5> Zm; bits<5> Zn; @@ -1214,10 +1243,10 @@ class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve_fp_3op_u_zd<bits<3> opc, string asm> { - def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>; - def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>; - def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>; +multiclass sve_fp_3op_u_zd<bits<3> opc, string asm, SDPatternOperator op> { + def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16, nxv8f16, nxv8f16, op>; + def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32, nxv4f32, nxv4f32, op>; + def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64, nxv2f64, nxv2f64, op>; } //===----------------------------------------------------------------------===// @@ -1489,7 +1518,7 @@ multiclass sve_fp_fcadd<string asm> { class sve2_fp_convert_precision<bits<4> opc, string asm, ZPRRegOp zprty1, ZPRRegOp zprty2> -: I<(outs zprty1:$Zd), (ins PPR3bAny:$Pg, zprty2:$Zn), +: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, PPR3bAny:$Pg, zprty2:$Zn), asm, "\t$Zd, $Pg/m, $Zn", "", []>, Sched<[]> { @@ -1504,6 +1533,8 @@ class sve2_fp_convert_precision<bits<4> opc, string asm, let Inst{12-10} = Pg; let Inst{9-5} = Zn; let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; } multiclass sve2_fp_convert_down_narrow<string asm> { @@ -1998,12 +2029,14 @@ class sve_intx_dot<bit sz, bit U, string asm, ZPRRegOp zprty1, let Constraints = "$Zda = $_Zda"; let DestructiveInstType = Destructive; - let ElementSize = zprty1.ElementSize; } -multiclass sve_intx_dot<bit opc, string asm> { +multiclass sve_intx_dot<bit opc, string asm, SDPatternOperator op> { def _S : sve_intx_dot<0b0, opc, asm, ZPR32, ZPR8>; def _D : sve_intx_dot<0b1, opc, asm, ZPR64, ZPR16>; + + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _S)>; + def : 
SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -2028,22 +2061,27 @@ class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm, let Constraints = "$Zda = $_Zda"; let DestructiveInstType = Destructive; - let ElementSize = ElementSizeNone; } -multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm> { - def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS> { +multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm, + SDPatternOperator op> { + def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b> { bits<2> iop; bits<3> Zm; let Inst{20-19} = iop; let Inst{18-16} = Zm; } - def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD> { + def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b> { bits<1> iop; bits<4> Zm; let Inst{20} = iop; let Inst{19-16} = Zm; } + + def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv16i8:$Op2, nxv16i8:$Op3, (i32 VectorIndexS32b:$idx))), + (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b:$idx)>; + def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv8i16:$Op2, nxv8i16:$Op3, (i32 VectorIndexD32b:$idx))), + (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b:$idx)>; } //===----------------------------------------------------------------------===// @@ -2399,21 +2437,40 @@ multiclass sve2_misc_bitwise<bits<4> opc, string asm> { def _D : sve2_misc<0b11, opc, asm, ZPR64, ZPR64>; } -multiclass sve2_bitwise_xor_interleaved<bit opc, string asm> { - let DestructiveInstType = Destructive, ElementSize = ElementSizeNone in { - def _B : sve2_misc<0b00, { 0b010, opc }, asm, ZPR8, ZPR8>; - def _H : sve2_misc<0b01, { 0b010, opc }, asm, ZPR16, ZPR16>; - def _S : sve2_misc<0b10, { 0b010, opc }, asm, ZPR32, ZPR32>; - def _D : sve2_misc<0b11, { 0b010, opc }, asm, ZPR64, ZPR64>; - } -} - multiclass sve2_misc_int_addsub_long_interleaved<bits<2> opc, string asm> { def _H : sve2_misc<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>; def _S : sve2_misc<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>; def _D : sve2_misc<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>; } +class sve2_bitwise_xor_interleaved<bits<2> sz, bits<1> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm), + asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = sz; + let Inst{21} = 0b0; + let Inst{20-16} = Zm; + let Inst{15-11} = 0b10010; + let Inst{10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_bitwise_xor_interleaved<bit opc, string asm> { + def _B : sve2_bitwise_xor_interleaved<0b00, opc, asm, ZPR8, ZPR8>; + def _H : sve2_bitwise_xor_interleaved<0b01, opc, asm, ZPR16, ZPR16>; + def _S : sve2_bitwise_xor_interleaved<0b10, opc, asm, ZPR32, ZPR32>; + def _D : sve2_bitwise_xor_interleaved<0b11, opc, asm, ZPR64, ZPR64>; +} + class sve2_bitwise_shift_left_long<bits<3> tsz8_64, bits<2> opc, string asm, ZPRRegOp zprty1, ZPRRegOp zprty2, Operand immtype> @@ -2451,9 +2508,9 @@ multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> { // SVE2 Accumulate Group //===----------------------------------------------------------------------===// -class sve2_int_bin_cons_shift_imm<bits<4> 
tsz8_64, bit opc, string asm, - ZPRRegOp zprty, Operand immtype> -: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm), +class sve2_int_bin_shift_imm<bits<4> tsz8_64, bit opc, string asm, + ZPRRegOp zprty, Operand immtype> +: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, immtype:$imm), asm, "\t$Zd, $Zn, $imm", "", []>, Sched<[]> { bits<5> Zd; @@ -2468,38 +2525,40 @@ class sve2_int_bin_cons_shift_imm<bits<4> tsz8_64, bit opc, string asm, let Inst{10} = opc; let Inst{9-5} = Zn; let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; } -multiclass sve2_int_bin_cons_shift_imm_left<bit opc, string asm> { - def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; - def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { +multiclass sve2_int_bin_shift_imm_left<bit opc, string asm> { + def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; + def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { let Inst{19} = imm{3}; } - def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> { + def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> { let Inst{20-19} = imm{4-3}; } - def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> { + def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } } -multiclass sve2_int_bin_cons_shift_imm_right<bit opc, string asm> { - def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; - def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { +multiclass sve2_int_bin_shift_imm_right<bit opc, string asm> { + def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; + def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { let Inst{19} = imm{3}; } - def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { + def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { let Inst{20-19} = imm{4-3}; } - def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { + def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } } -class sve2_int_bin_accum_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm, - ZPRRegOp zprty, Operand immtype> +class sve2_int_bin_accum_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm, + ZPRRegOp zprty, Operand immtype> : I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, immtype:$imm), asm, "\t$Zda, $Zn, $imm", "", []>, Sched<[]> { @@ -2521,15 +2580,15 @@ class sve2_int_bin_accum_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm let ElementSize = ElementSizeNone; } -multiclass sve2_int_bin_accum_cons_shift_imm_right<bits<2> opc, string asm> { - def _B : sve2_int_bin_accum_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; - def _H : sve2_int_bin_accum_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { +multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm> { + def _B : sve2_int_bin_accum_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; + def _H : sve2_int_bin_accum_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { let Inst{19} = imm{3}; } - def _S : sve2_int_bin_accum_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { + def _S : sve2_int_bin_accum_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { let Inst{20-19} = imm{4-3}; } - def _D : sve2_int_bin_accum_cons_shift_imm<{1,?,?,?}, opc, asm, 
ZPR64, vecshiftR64> { + def _D : sve2_int_bin_accum_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } @@ -2607,9 +2666,9 @@ multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm> { // SVE2 Narrowing Group //===----------------------------------------------------------------------===// -class sve2_int_bin_cons_shift_imm_narrow<bits<3> tsz8_64, bits<4> opc, - string asm, ZPRRegOp zprty1, - ZPRRegOp zprty2, Operand immtype> +class sve2_int_bin_shift_imm_narrow_bottom<bits<3> tsz8_64, bits<3> opc, + string asm, ZPRRegOp zprty1, + ZPRRegOp zprty2, Operand immtype> : I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm), asm, "\t$Zd, $Zn, $imm", "", []>, Sched<[]> { @@ -2622,26 +2681,63 @@ class sve2_int_bin_cons_shift_imm_narrow<bits<3> tsz8_64, bits<4> opc, let Inst{20-19} = tsz8_64{1-0}; let Inst{18-16} = imm{2-0}; // imm3 let Inst{15-14} = 0b00; - let Inst{13-10} = opc; + let Inst{13-11} = opc; + let Inst{10} = 0b0; let Inst{9-5} = Zn; let Inst{4-0} = Zd; } -multiclass sve2_int_bin_cons_shift_imm_right_narrow<bits<4> opc, string asm> { - def _B : sve2_int_bin_cons_shift_imm_narrow<{0,0,1}, opc, asm, ZPR8, ZPR16, - vecshiftR8>; - def _H : sve2_int_bin_cons_shift_imm_narrow<{0,1,?}, opc, asm, ZPR16, ZPR32, - vecshiftR16> { +multiclass sve2_int_bin_shift_imm_right_narrow_bottom<bits<3> opc, string asm> { + def _B : sve2_int_bin_shift_imm_narrow_bottom<{0,0,1}, opc, asm, ZPR8, ZPR16, + vecshiftR8>; + def _H : sve2_int_bin_shift_imm_narrow_bottom<{0,1,?}, opc, asm, ZPR16, ZPR32, + vecshiftR16> { let Inst{19} = imm{3}; } - def _S : sve2_int_bin_cons_shift_imm_narrow<{1,?,?}, opc, asm, ZPR32, ZPR64, - vecshiftR32> { + def _S : sve2_int_bin_shift_imm_narrow_bottom<{1,?,?}, opc, asm, ZPR32, ZPR64, + vecshiftR32> { let Inst{20-19} = imm{4-3}; } } -class sve2_int_addsub_narrow_high<bits<2> sz, bits<3> opc, string asm, - ZPRRegOp zprty1, ZPRRegOp zprty2> +class sve2_int_bin_shift_imm_narrow_top<bits<3> tsz8_64, bits<3> opc, + string asm, ZPRRegOp zprty1, + ZPRRegOp zprty2, Operand immtype> +: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, immtype:$imm), + asm, "\t$Zd, $Zn, $imm", + "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> imm; + let Inst{31-23} = 0b010001010; + let Inst{22} = tsz8_64{2}; + let Inst{21} = 0b1; + let Inst{20-19} = tsz8_64{1-0}; + let Inst{18-16} = imm{2-0}; // imm3 + let Inst{15-14} = 0b00; + let Inst{13-11} = opc; + let Inst{10} = 0b1; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; +} + +multiclass sve2_int_bin_shift_imm_right_narrow_top<bits<3> opc, string asm> { + def _B : sve2_int_bin_shift_imm_narrow_top<{0,0,1}, opc, asm, ZPR8, ZPR16, + vecshiftR8>; + def _H : sve2_int_bin_shift_imm_narrow_top<{0,1,?}, opc, asm, ZPR16, ZPR32, + vecshiftR16> { + let Inst{19} = imm{3}; + } + def _S : sve2_int_bin_shift_imm_narrow_top<{1,?,?}, opc, asm, ZPR32, ZPR64, + vecshiftR32> { + let Inst{20-19} = imm{4-3}; + } +} + +class sve2_int_addsub_narrow_high_bottom<bits<2> sz, bits<2> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> : I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm), asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { bits<5> Zd; @@ -2652,19 +2748,46 @@ class sve2_int_addsub_narrow_high<bits<2> sz, bits<3> opc, string asm, let Inst{21} = 0b1; let Inst{20-16} = Zm; let Inst{15-13} = 0b011; - let Inst{12-10} = opc; // S, R, T + let Inst{12-11} = opc; // S, R + let Inst{10} = 0b0; // Top let Inst{9-5} = Zn; let Inst{4-0} = Zd; } -multiclass sve2_int_addsub_narrow_high<bits<3> 
opc, string asm> { - def _B : sve2_int_addsub_narrow_high<0b01, opc, asm, ZPR8, ZPR16>; - def _H : sve2_int_addsub_narrow_high<0b10, opc, asm, ZPR16, ZPR32>; - def _S : sve2_int_addsub_narrow_high<0b11, opc, asm, ZPR32, ZPR64>; +multiclass sve2_int_addsub_narrow_high_bottom<bits<2> opc, string asm> { + def _B : sve2_int_addsub_narrow_high_bottom<0b01, opc, asm, ZPR8, ZPR16>; + def _H : sve2_int_addsub_narrow_high_bottom<0b10, opc, asm, ZPR16, ZPR32>; + def _S : sve2_int_addsub_narrow_high_bottom<0b11, opc, asm, ZPR32, ZPR64>; +} + +class sve2_int_addsub_narrow_high_top<bits<2> sz, bits<2> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm), + asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = sz; + let Inst{21} = 0b1; + let Inst{20-16} = Zm; + let Inst{15-13} = 0b011; + let Inst{12-11} = opc; // S, R + let Inst{10} = 0b1; // Top + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; +} + +multiclass sve2_int_addsub_narrow_high_top<bits<2> opc, string asm> { + def _B : sve2_int_addsub_narrow_high_top<0b01, opc, asm, ZPR8, ZPR16>; + def _H : sve2_int_addsub_narrow_high_top<0b10, opc, asm, ZPR16, ZPR32>; + def _S : sve2_int_addsub_narrow_high_top<0b11, opc, asm, ZPR32, ZPR64>; } -class sve2_int_sat_extract_narrow<bits<3> tsz8_64, bits<3> opc, string asm, - ZPRRegOp zprty1, ZPRRegOp zprty2> +class sve2_int_sat_extract_narrow_bottom<bits<3> tsz8_64, bits<2> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> : I<(outs zprty1:$Zd), (ins zprty2:$Zn), asm, "\t$Zd, $Zn", "", []>, Sched<[]> { bits<5> Zd; @@ -2674,15 +2797,41 @@ class sve2_int_sat_extract_narrow<bits<3> tsz8_64, bits<3> opc, string asm, let Inst{21} = 0b1; let Inst{20-19} = tsz8_64{1-0}; let Inst{18-13} = 0b000010; - let Inst{12-10} = opc; + let Inst{12-11} = opc; + let Inst{10} = 0b0; let Inst{9-5} = Zn; let Inst{4-0} = Zd; } -multiclass sve2_int_sat_extract_narrow<bits<3> opc, string asm> { - def _B : sve2_int_sat_extract_narrow<0b001, opc, asm, ZPR8, ZPR16>; - def _H : sve2_int_sat_extract_narrow<0b010, opc, asm, ZPR16, ZPR32>; - def _S : sve2_int_sat_extract_narrow<0b100, opc, asm, ZPR32, ZPR64>; +multiclass sve2_int_sat_extract_narrow_bottom<bits<2> opc, string asm> { + def _B : sve2_int_sat_extract_narrow_bottom<0b001, opc, asm, ZPR8, ZPR16>; + def _H : sve2_int_sat_extract_narrow_bottom<0b010, opc, asm, ZPR16, ZPR32>; + def _S : sve2_int_sat_extract_narrow_bottom<0b100, opc, asm, ZPR32, ZPR64>; +} + +class sve2_int_sat_extract_narrow_top<bits<3> tsz8_64, bits<2> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn), + asm, "\t$Zd, $Zn", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + let Inst{31-23} = 0b010001010; + let Inst{22} = tsz8_64{2}; + let Inst{21} = 0b1; + let Inst{20-19} = tsz8_64{1-0}; + let Inst{18-13} = 0b000010; + let Inst{12-11} = opc; + let Inst{10} = 0b1; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; +} + +multiclass sve2_int_sat_extract_narrow_top<bits<2> opc, string asm> { + def _B : sve2_int_sat_extract_narrow_top<0b001, opc, asm, ZPR8, ZPR16>; + def _H : sve2_int_sat_extract_narrow_top<0b010, opc, asm, ZPR16, ZPR32>; + def _S : sve2_int_sat_extract_narrow_top<0b100, opc, asm, ZPR32, ZPR64>; } //===----------------------------------------------------------------------===// @@ -2713,11 +2862,17 @@ class sve_int_un_pred_arit<bits<2> sz8_64, 
bits<4> opc, let ElementSize = zprty.ElementSize; } -multiclass sve_int_un_pred_arit_0<bits<3> opc, string asm> { +multiclass sve_int_un_pred_arit_0<bits<3> opc, string asm, + SDPatternOperator op> { def _B : sve_int_un_pred_arit<0b00, { opc, 0b0 }, asm, ZPR8>; def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>; def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>; def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>; + + def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_un_pred_arit_0_h<bits<3> opc, string asm> { @@ -2735,11 +2890,21 @@ multiclass sve_int_un_pred_arit_0_d<bits<3> opc, string asm> { def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>; } -multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm> { +multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm, + SDPatternOperator op> { def _B : sve_int_un_pred_arit<0b00, { opc, 0b1 }, asm, ZPR8>; def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>; def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>; def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>; + + def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; + + def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_un_pred_arit_1_fp<bits<3> opc, string asm> { @@ -3886,9 +4051,9 @@ multiclass sve_mem_cstnt_ss<bits<2> msz, string asm, RegisterOperand listty, (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>; } -class sve2_mem_cstnt_vs_base<bits<3> opc, dag iops, string asm, - RegisterOperand VecList> -: I<(outs VecList:$Zt), iops, +class sve2_mem_sstnt_vs_base<bits<3> opc, string asm, + RegisterOperand listty, ZPRRegOp zprty> +: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), asm, "\t$Zt, $Pg, [$Zn, $Rm]", "", []>, Sched<[]> { @@ -3908,17 +4073,14 @@ class sve2_mem_cstnt_vs_base<bits<3> opc, dag iops, string asm, let mayStore = 1; } -multiclass sve2_mem_cstnt_vs<bits<3> opc, string asm, +multiclass sve2_mem_sstnt_vs<bits<3> opc, string asm, RegisterOperand listty, ZPRRegOp zprty> { - def _REAL : sve2_mem_cstnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), - asm, listty>; + def _REAL : sve2_mem_sstnt_vs_base<opc, asm, listty, zprty>; def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]", (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]", (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>; - def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]", - (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]", 
(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>; } @@ -4147,6 +4309,14 @@ class sve_int_perm_punpk<bit opc, string asm> let Inst{3-0} = Pd; } +multiclass sve_int_perm_punpk<bit opc, string asm, SDPatternOperator op> { + def NAME : sve_int_perm_punpk<opc, asm>; + + def : SVE_1_Op_Pat<nxv8i1, op, nxv16i1, !cast<Instruction>(NAME)>; + def : SVE_1_Op_Pat<nxv4i1, op, nxv8i1, !cast<Instruction>(NAME)>; + def : SVE_1_Op_Pat<nxv2i1, op, nxv4i1, !cast<Instruction>(NAME)>; +} + class sve_int_rdffr_pred<bit s, string asm> : I<(outs PPR8:$Pd), (ins PPRAny:$Pg), asm, "\t$Pd, $Pg/z", @@ -5094,7 +5264,7 @@ multiclass sve_mem_p_fill<string asm> { (!cast<Instruction>(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>; } -class sve2_mem_cldnt_vs_base<bits<5> opc, dag iops, string asm, +class sve2_mem_gldnt_vs_base<bits<5> opc, dag iops, string asm, RegisterOperand VecList> : I<(outs VecList:$Zt), iops, asm, "\t$Zt, $Pg/z, [$Zn, $Rm]", @@ -5119,17 +5289,15 @@ class sve2_mem_cldnt_vs_base<bits<5> opc, dag iops, string asm, let mayLoad = 1; } -multiclass sve2_mem_cldnt_vs<bits<5> opc, string asm, +multiclass sve2_mem_gldnt_vs<bits<5> opc, string asm, RegisterOperand listty, ZPRRegOp zprty> { - def _REAL : sve2_mem_cldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), + def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), asm, listty>; def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]", (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]", (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>; - def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]", - (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]", (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>; } diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index 7bb075c36e79..c27fc7a112ec 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -125,7 +125,7 @@ namespace llvm { uint32_t AArch64SysReg::parseGenericRegister(StringRef Name) { // Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name - Regex GenericRegPattern("^S([0-3])_([0-7])_C([0-9]|1[0-5])_C([0-9]|1[0-5])_([0-7])$"); + static const Regex GenericRegPattern("^S([0-3])_([0-7])_C([0-9]|1[0-5])_C([0-9]|1[0-5])_([0-7])$"); std::string UpperName = Name.upper(); SmallVector<StringRef, 5> Ops; diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h index e5e2fc2cb0df..7a4fcac09ec4 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -313,9 +313,9 @@ struct SysAlias { uint16_t Encoding; FeatureBitset FeaturesRequired; - SysAlias (const char *N, uint16_t E) : Name(N), Encoding(E) {}; - SysAlias (const char *N, uint16_t E, FeatureBitset F) : - Name(N), Encoding(E), FeaturesRequired(F) {}; + constexpr SysAlias(const char *N, uint16_t E) : Name(N), Encoding(E) {} + constexpr SysAlias(const char *N, uint16_t E, FeatureBitset F) + : Name(N), Encoding(E), FeaturesRequired(F) {} bool haveFeatures(FeatureBitset ActiveFeatures) const { return (FeaturesRequired & ActiveFeatures) == FeaturesRequired; @@ -326,9 +326,10 @@ struct SysAlias { struct SysAliasReg : SysAlias { bool NeedsReg; - SysAliasReg(const char *N, uint16_t E, bool R) : SysAlias(N, E), 
NeedsReg(R) {}; - SysAliasReg(const char *N, uint16_t E, bool R, FeatureBitset F) : SysAlias(N, E, F), - NeedsReg(R) {}; + constexpr SysAliasReg(const char *N, uint16_t E, bool R) + : SysAlias(N, E), NeedsReg(R) {} + constexpr SysAliasReg(const char *N, uint16_t E, bool R, FeatureBitset F) + : SysAlias(N, E, F), NeedsReg(R) {} }; namespace AArch64AT{ @@ -627,6 +628,18 @@ namespace AArch64II { /// MO_S - Indicates that the bits of the symbol operand represented by /// MO_G0 etc are signed. MO_S = 0x100, + + /// MO_PREL - Indicates that the bits of the symbol operand represented by + /// MO_G0 etc are PC relative. + MO_PREL = 0x200, + + /// MO_TAGGED - With MO_PAGE, indicates that the page includes a memory tag + /// in bits 56-63. + /// On a FrameIndex operand, indicates that the underlying memory is tagged + /// with an unknown tag value (MTE); this needs to be lowered either to an + /// SP-relative load or store instruction (which do not check tags), or to + /// an LDG instruction to obtain the tag value. + MO_TAGGED = 0x400, }; } // end namespace AArch64II diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 19a8bd901629..b64422ae5427 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -188,6 +188,10 @@ ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true); ModulePass *createR600OpenCLImageTypeLoweringPass(); FunctionPass *createAMDGPUAnnotateUniformValues(); +ModulePass *createAMDGPUPrintfRuntimeBinding(); +void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry&); +extern char &AMDGPUPrintfRuntimeBindingID; + ModulePass* createAMDGPUUnifyMetadataPass(); void initializeAMDGPUUnifyMetadataPass(PassRegistry&); extern char &AMDGPUUnifyMetadataID; diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index baeba534012c..42b477e07b3b 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -10,6 +10,15 @@ include "llvm/TableGen/SearchableTable.td" include "llvm/Target/Target.td" include "AMDGPUFeatures.td" +def p0 : PtrValueType<i64, 0>; +def p1 : PtrValueType<i64, 1>; +def p2 : PtrValueType<i32, 2>; +def p3 : PtrValueType<i32, 3>; +def p4 : PtrValueType<i64, 4>; +def p5 : PtrValueType<i32, 5>; +def p6 : PtrValueType<i32, 6>; + + class BoolToList<bit Value> { list<int> ret = !if(Value, [1]<int>, []<int>); } @@ -145,6 +154,12 @@ def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug", "Some GFX10 bug with misaligned multi-dword LDS access in WGP mode" >; +def FeatureMFMAInlineLiteralBug : SubtargetFeature<"mfma-inline-literal-bug", + "HasMFMAInlineLiteralBug", + "true", + "MFMA cannot use inline literal as SrcC" +>; + def FeatureVcmpxPermlaneHazard : SubtargetFeature<"vcmpx-permlane-hazard", "HasVcmpxPermlaneHazard", "true", @@ -802,6 +817,7 @@ def FeatureISAVersion9_0_8 : FeatureSet< FeaturePkFmacF16Inst, FeatureAtomicFaddInsts, FeatureSRAMECC, + FeatureMFMAInlineLiteralBug, FeatureCodeObjectV3]>; def FeatureISAVersion9_0_9 : FeatureSet< diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index 419ebb2240ad..e72b3f4fde63 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -173,6 +173,9 @@ static StringRef intrinsicToAttrName(Intrinsic::ID ID, case Intrinsic::amdgcn_implicitarg_ptr: return "amdgpu-implicitarg-ptr"; case Intrinsic::amdgcn_queue_ptr: + case Intrinsic::amdgcn_is_shared: + case Intrinsic::amdgcn_is_private: + // TODO: Does not require 
queue ptr on gfx9+ case Intrinsic::trap: case Intrinsic::debugtrap: IsQueuePtr = true; @@ -194,18 +197,12 @@ static bool handleAttr(Function &Parent, const Function &Callee, static void copyFeaturesToFunction(Function &Parent, const Function &Callee, bool &NeedQueuePtr) { // X ids unnecessarily propagated to kernels. - static const StringRef AttrNames[] = { - { "amdgpu-work-item-id-x" }, - { "amdgpu-work-item-id-y" }, - { "amdgpu-work-item-id-z" }, - { "amdgpu-work-group-id-x" }, - { "amdgpu-work-group-id-y" }, - { "amdgpu-work-group-id-z" }, - { "amdgpu-dispatch-ptr" }, - { "amdgpu-dispatch-id" }, - { "amdgpu-kernarg-segment-ptr" }, - { "amdgpu-implicitarg-ptr" } - }; + static constexpr StringLiteral AttrNames[] = { + "amdgpu-work-item-id-x", "amdgpu-work-item-id-y", + "amdgpu-work-item-id-z", "amdgpu-work-group-id-x", + "amdgpu-work-group-id-y", "amdgpu-work-group-id-z", + "amdgpu-dispatch-ptr", "amdgpu-dispatch-id", + "amdgpu-kernarg-segment-ptr", "amdgpu-implicitarg-ptr"}; if (handleAttr(Parent, Callee, "amdgpu-queue-ptr")) NeedQueuePtr = true; diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index 097730441ed8..f0e7ee910f95 100644 --- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -48,8 +48,8 @@ public: return ArgDescriptor(Reg, Mask, false, true); } - static ArgDescriptor createStack(Register Reg, unsigned Mask = ~0u) { - return ArgDescriptor(Reg, Mask, true, true); + static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) { + return ArgDescriptor(Offset, Mask, true, true); } static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) { diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 743ac64b8f10..f2d903c8e7b1 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -229,7 +229,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { // alignment. Streamer.EmitValueToAlignment(64, 0, 1, 0); if (ReadOnlySection.getAlignment() < 64) - ReadOnlySection.setAlignment(64); + ReadOnlySection.setAlignment(Align(64)); const MCSubtargetInfo &STI = MF->getSubtarget(); @@ -273,7 +273,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { AsmPrinter::EmitFunctionEntryLabel(); } -void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { +void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) { if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) { // Write a line for the basic block label if it is not only fallthrough. DisasmLines.push_back( @@ -342,6 +342,8 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) { // Print comments that apply to both callable functions and entry points. 
void AMDGPUAsmPrinter::emitCommonFunctionComments( uint32_t NumVGPR, + Optional<uint32_t> NumAGPR, + uint32_t TotalNumVGPR, uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize, @@ -349,6 +351,11 @@ void AMDGPUAsmPrinter::emitCommonFunctionComments( OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false); OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false); + if (NumAGPR) { + OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false); + OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR), + false); + } OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false); OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()), false); @@ -417,7 +424,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { // The starting address of all shader programs must be 256 bytes aligned. // Regular functions just need the basic required instruction alignment. - MF.setAlignment(MFI->isEntryFunction() ? 8 : 2); + MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4)); SetupMachineFunction(MF); @@ -474,6 +481,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()]; emitCommonFunctionComments( Info.NumVGPR, + STM.hasMAIInsts() ? Info.NumAGPR : Optional<uint32_t>(), + Info.getTotalNumVGPRs(STM), Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()), Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI); @@ -481,7 +490,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { } OutStreamer->emitRawComment(" Kernel info:", false); - emitCommonFunctionComments(CurrentProgramInfo.NumVGPR, + emitCommonFunctionComments(CurrentProgramInfo.NumArchVGPR, + STM.hasMAIInsts() + ? CurrentProgramInfo.NumAccVGPR + : Optional<uint32_t>(), + CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR, CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI); @@ -507,6 +520,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false); OutStreamer->emitRawComment( + " Occupancy: " + + Twine(CurrentProgramInfo.Occupancy), false); + + OutStreamer->emitRawComment( " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false); OutStreamer->emitRawComment( @@ -588,6 +605,11 @@ int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs( UsesVCC, UsesFlatScratch); } +int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs( + const GCNSubtarget &ST) const { + return std::max(NumVGPR, NumAGPR); +} + AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( const MachineFunction &MF) const { SIFunctionResourceInfo Info; @@ -634,11 +656,18 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( HighestVGPRReg = Reg; break; } - MCPhysReg AReg = AMDGPU::AGPR0 + TRI.getHWRegIndex(Reg); - if (MRI.isPhysRegUsed(AReg)) { - HighestVGPRReg = AReg; - break; + } + + if (ST.hasMAIInsts()) { + MCPhysReg HighestAGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestAGPRReg = Reg; + break; + } } + Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister ? 
0 : + TRI.getHWRegIndex(HighestAGPRReg) + 1; } MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; @@ -660,6 +689,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } int32_t MaxVGPR = -1; + int32_t MaxAGPR = -1; int32_t MaxSGPR = -1; uint64_t CalleeFrameSize = 0; @@ -669,11 +699,12 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( for (const MachineOperand &MO : MI.operands()) { unsigned Width = 0; bool IsSGPR = false; + bool IsAGPR = false; if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); switch (Reg) { case AMDGPU::EXEC: case AMDGPU::EXEC_LO: @@ -744,6 +775,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Width = 1; } else if (AMDGPU::AGPR_32RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 1; } else if (AMDGPU::SReg_64RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_64RegClass.contains(Reg) && @@ -755,6 +787,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Width = 2; } else if (AMDGPU::AReg_64RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 2; } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { IsSGPR = false; @@ -771,6 +804,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Width = 4; } else if (AMDGPU::AReg_128RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 4; } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_256RegClass.contains(Reg) && @@ -790,6 +824,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Width = 16; } else if (AMDGPU::AReg_512RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 16; } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) { IsSGPR = true; @@ -799,6 +834,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Width = 32; } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 32; } else { llvm_unreachable("Unknown register class"); @@ -807,6 +843,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( int MaxUsed = HWReg + Width - 1; if (IsSGPR) { MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR; + } else if (IsAGPR) { + MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR; } else { MaxVGPR = MaxUsed > MaxVGPR ? 
MaxUsed : MaxVGPR; } @@ -828,6 +866,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( 47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace()); MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess); MaxVGPR = std::max(MaxVGPR, 23); + MaxAGPR = std::max(MaxAGPR, 23); CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384)); Info.UsesVCC = true; @@ -852,6 +891,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR); MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR); + MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR); CalleeFrameSize = std::max(I->second.PrivateSegmentSize, CalleeFrameSize); Info.UsesVCC |= I->second.UsesVCC; @@ -868,6 +908,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Info.NumExplicitSGPR = MaxSGPR + 1; Info.NumVGPR = MaxVGPR + 1; + Info.NumAGPR = MaxAGPR + 1; Info.PrivateSegmentSize += CalleeFrameSize; return Info; @@ -876,8 +917,11 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const MachineFunction &MF) { SIFunctionResourceInfo Info = analyzeResourceUsage(MF); + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); - ProgInfo.NumVGPR = Info.NumVGPR; + ProgInfo.NumArchVGPR = Info.NumVGPR; + ProgInfo.NumAccVGPR = Info.NumAGPR; + ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM); ProgInfo.NumSGPR = Info.NumExplicitSGPR; ProgInfo.ScratchSize = Info.PrivateSegmentSize; ProgInfo.VCCUsed = Info.UsesVCC; @@ -890,7 +934,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, MF.getFunction().getContext().diagnose(DiagStackSize); } - const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are @@ -1057,6 +1100,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP. S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) | S_00B84C_EXCP_EN(0); + + ProgInfo.Occupancy = STM.computeOccupancy(MF, ProgInfo.LDSSize, + ProgInfo.NumSGPRsForWavesPerEU, + ProgInfo.NumVGPRsForWavesPerEU); } static unsigned getRsrcReg(CallingConv::ID CallConv) { @@ -1214,17 +1261,16 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, if (STM.isXNACKEnabled()) Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; - unsigned MaxKernArgAlign; + Align MaxKernArgAlign; Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR; Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR; Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize; Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize; - // These alignment values are specified in powers of two, so alignment = - // 2^n. The minimum alignment is 2^4 = 16. - Out.kernarg_segment_alignment = std::max<size_t>(4, - countTrailingZeros(MaxKernArgAlign)); + // kernarg_segment_alignment is specified as log of the alignment. + // The minimum alignment is 16. 
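— Reviewer note: the assignment below encodes the alignment as its base-2 logarithm, clamped so the stored value never represents less than 16 bytes (so the minimum encoded value is 4). A minimal, self-contained model of that computation, assuming a power-of-two alignment as LLVM's Align guarantees (illustrative C++, not the LLVM API):

#include <algorithm>
#include <cstdint>

// Encode an alignment the way kernarg_segment_alignment stores it:
// as log2(alignment) with a floor of 16 bytes. E.g. 8 -> 4, 16 -> 4, 64 -> 6.
static uint8_t encodeKernargSegmentAlignment(uint64_t MaxKernArgAlign) {
  uint64_t A = std::max<uint64_t>(16, MaxKernArgAlign); // clamp to the 16-byte minimum
  uint8_t L = 0;
  while ((uint64_t{1} << L) < A)
    ++L; // log2 of a power of two
  return L;
}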
+ Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign)); } bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index cf77034329ef..c50c19a4609c 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -43,6 +43,7 @@ private: // Track the number of explicitly used VGPRs. Special registers reserved at // the end are tracked separately. int32_t NumVGPR = 0; + int32_t NumAGPR = 0; int32_t NumExplicitSGPR = 0; uint64_t PrivateSegmentSize = 0; bool UsesVCC = false; @@ -51,6 +52,7 @@ private: bool HasRecursion = false; int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const; + int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const; }; SIProgramInfo CurrentProgramInfo; @@ -77,6 +79,8 @@ private: void EmitPALMetadata(const MachineFunction &MF, const SIProgramInfo &KernelInfo); void emitCommonFunctionComments(uint32_t NumVGPR, + Optional<uint32_t> NumAGPR, + uint32_t TotalNumVGPR, uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize, @@ -125,7 +129,7 @@ public: void EmitFunctionEntryLabel() override; - void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override; + void EmitBasicBlockStart(const MachineBasicBlock &MBB) override; void EmitGlobalVariable(const GlobalVariable *GV) override; @@ -140,8 +144,8 @@ public: const char *ExtraCode, raw_ostream &O) override; protected: - mutable std::vector<std::string> DisasmLines, HexLines; - mutable size_t DisasmLineMaxLen; + std::vector<std::string> DisasmLines, HexLines; + size_t DisasmLineMaxLen; }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 8a92e7d923fb..ba8343142c63 100644 --- a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" +#include "SIDefines.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IRBuilder.h" @@ -24,20 +25,10 @@ #define DEBUG_TYPE "amdgpu-atomic-optimizer" using namespace llvm; +using namespace llvm::AMDGPU; namespace { -enum DPP_CTRL { - DPP_ROW_SR1 = 0x111, - DPP_ROW_SR2 = 0x112, - DPP_ROW_SR3 = 0x113, - DPP_ROW_SR4 = 0x114, - DPP_ROW_SR8 = 0x118, - DPP_WF_SR1 = 0x138, - DPP_ROW_BCAST15 = 0x142, - DPP_ROW_BCAST31 = 0x143 -}; - struct ReplacementInfo { Instruction *I; AtomicRMWInst::BinOp Op; @@ -52,9 +43,12 @@ private: const LegacyDivergenceAnalysis *DA; const DataLayout *DL; DominatorTree *DT; - bool HasDPP; + const GCNSubtarget *ST; bool IsPixelShader; + Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V, + Value *const Identity) const; + Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const; void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx, bool ValDivergent) const; @@ -93,8 +87,7 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) { DT = DTW ? 
&DTW->getDomTree() : nullptr; const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); const TargetMachine &TM = TPC.getTM<TargetMachine>(); - const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); - HasDPP = ST.hasDPP(); + ST = &TM.getSubtarget<GCNSubtarget>(F); IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS; visit(F); @@ -142,17 +135,18 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) { // If the pointer operand is divergent, then each lane is doing an atomic // operation on a different address, and we cannot optimize that. - if (DA->isDivergent(I.getOperand(PtrIdx))) { + if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) { return; } - const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx)); + const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx)); // If the value operand is divergent, each lane is contributing a different // value to the atomic calculation. We can only optimize divergent values if // we have DPP available on our subtarget, and the atomic operation is 32 // bits. - if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) { + if (ValDivergent && + (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) { return; } @@ -219,20 +213,21 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) { const unsigned ValIdx = 0; - const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx)); + const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx)); // If the value operand is divergent, each lane is contributing a different // value to the atomic calculation. We can only optimize divergent values if // we have DPP available on our subtarget, and the atomic operation is 32 // bits. - if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) { + if (ValDivergent && + (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) { return; } // If any of the other arguments to the intrinsic are divergent, we can't // optimize the operation. for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) { - if (DA->isDivergent(I.getOperand(Idx))) { + if (DA->isDivergentUse(&I.getOperandUse(Idx))) { return; } } @@ -282,6 +277,111 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op, return B.CreateSelect(Cond, LHS, RHS); } +// Use the builder to create an inclusive scan of V across the wavefront, with +// all lanes active. +Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, + Value *V, Value *const Identity) const { + Type *const Ty = V->getType(); + Module *M = B.GetInsertBlock()->getModule(); + Function *UpdateDPP = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); + Function *PermLaneX16 = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {}); + Function *ReadLane = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + + for (unsigned Idx = 0; Idx < 4; Idx++) { + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx), + B.getInt32(0xf), B.getInt32(0xf), B.getFalse()})); + } + if (ST->hasDPPBroadcasts()) { + // GFX9 has DPP row broadcast operations. 
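— Reviewer note: buildScan works in two phases. The four ROW_SHR steps above (shifts of 1, 2, 4 and 8 lanes) form an inclusive prefix scan within each 16-lane row; the code below then stitches rows together, either with GFX9's row broadcasts (row_mask 0xa lets only rows 1 and 3 receive BCAST15, 0xc lets rows 2 and 3 receive BCAST31) or, on GFX10, with permlanex16/readlane, since GFX10 DPP cannot cross rows. A scalar model of the in-row phase only (illustrative, not LLVM code):

#include <cstdint>

// Hillis-Steele scan over one 16-lane row: combining shifted copies at
// offsets 1, 2, 4 and 8 leaves Lane[i] holding the sum of lanes 0..i.
// Shifted-in positions contribute the identity (0 for an add).
static void rowInclusiveScanAdd(uint32_t Lane[16]) {
  for (unsigned Shift = 1; Shift <= 8; Shift <<= 1)
    for (int I = 15; I >= 0; --I) // descending, so reads see pre-step values
      if (I >= (int)Shift)
        Lane[I] += Lane[I - Shift];
}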
+ V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa), + B.getInt32(0xf), B.getFalse()})); + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc), + B.getInt32(0xf), B.getFalse()})); + } else { + // On GFX10 all DPP operations are confined to a single row. To get cross- + // row operations we have to use permlane or readlane. + + // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes + // 48..63). + Value *const PermX = + B.CreateCall(PermLaneX16, {V, V, B.getInt32(-1), B.getInt32(-1), + B.getFalse(), B.getFalse()}); + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID), + B.getInt32(0xa), B.getInt32(0xf), B.getFalse()})); + if (!ST->isWave32()) { + // Combine lane 31 into lanes 32..63. + Value *const Lane31 = B.CreateCall(ReadLane, {V, B.getInt32(31)}); + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID), + B.getInt32(0xc), B.getInt32(0xf), B.getFalse()})); + } + } + return V; +} + +// Use the builder to create a shift right of V across the wavefront, with all +// lanes active, to turn an inclusive scan into an exclusive scan. +Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V, + Value *const Identity) const { + Type *const Ty = V->getType(); + Module *M = B.GetInsertBlock()->getModule(); + Function *UpdateDPP = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); + Function *ReadLane = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + Function *WriteLane = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {}); + + if (ST->hasDPPWavefrontShifts()) { + // GFX9 has DPP wavefront shift operations. + V = B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf), + B.getInt32(0xf), B.getFalse()}); + } else { + // On GFX10 all DPP operations are confined to a single row. To get cross- + // row operations we have to use permlane or readlane. + Value *Old = V; + V = B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1), + B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); + + // Copy the old lane 15 to the new lane 16. + V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}), + B.getInt32(16), V}); + + if (!ST->isWave32()) { + // Copy the old lane 31 to the new lane 32. + V = B.CreateCall( + WriteLane, + {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V}); + + // Copy the old lane 47 to the new lane 48. + V = B.CreateCall( + WriteLane, + {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V}); + } + } + + return V; +} + static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op, unsigned BitWidth) { switch (Op) { @@ -345,23 +445,29 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, // We need to know how many lanes are active within the wavefront, and we do // this by doing a ballot of active lanes. + Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize()); CallInst *const Ballot = B.CreateIntrinsic( - Intrinsic::amdgcn_icmp, {B.getInt64Ty(), B.getInt32Ty()}, + Intrinsic::amdgcn_icmp, {WaveTy, B.getInt32Ty()}, {B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)}); // We need to know how many lanes are active within the wavefront that are // below us. 
If we counted each lane linearly starting from 0, a lane is // below us only if its associated index was less than ours. We do this by // using the mbcnt intrinsic. - Value *const BitCast = B.CreateBitCast(Ballot, VecTy); - Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0)); - Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1)); - CallInst *const PartialMbcnt = B.CreateIntrinsic( - Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)}); - Value *const Mbcnt = - B.CreateIntCast(B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, - {ExtractHi, PartialMbcnt}), - Ty, false); + Value *Mbcnt; + if (ST->isWave32()) { + Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, + {Ballot, B.getInt32(0)}); + } else { + Value *const BitCast = B.CreateBitCast(Ballot, VecTy); + Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0)); + Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1)); + Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, + {ExtractLo, B.getInt32(0)}); + Mbcnt = + B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt}); + } + Mbcnt = B.CreateIntCast(Mbcnt, Ty, false); Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth)); @@ -373,47 +479,25 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, if (ValDivergent) { // First we need to set all inactive invocations to the identity value, so // that they can correctly contribute to the final result. - CallInst *const SetInactive = - B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); - - CallInst *const FirstDPP = - B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty, - {Identity, SetInactive, B.getInt32(DPP_WF_SR1), - B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); - ExclScan = FirstDPP; - - const unsigned Iters = 7; - const unsigned DPPCtrl[Iters] = { - DPP_ROW_SR1, DPP_ROW_SR2, DPP_ROW_SR3, DPP_ROW_SR4, - DPP_ROW_SR8, DPP_ROW_BCAST15, DPP_ROW_BCAST31}; - const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xf, 0xa, 0xc}; - const unsigned BankMask[Iters] = {0xf, 0xf, 0xf, 0xe, 0xc, 0xf, 0xf}; + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); - // This loop performs an exclusive scan across the wavefront, with all lanes - // active (by using the WWM intrinsic). - for (unsigned Idx = 0; Idx < Iters; Idx++) { - Value *const UpdateValue = Idx < 3 ? FirstDPP : ExclScan; - CallInst *const DPP = B.CreateIntrinsic( - Intrinsic::amdgcn_update_dpp, Ty, - {Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]), - B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()}); - - ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP); - } - - NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan); + const AtomicRMWInst::BinOp ScanOp = + Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op; + NewV = buildScan(B, ScanOp, NewV, Identity); + ExclScan = buildShiftRight(B, NewV, Identity); // Read the value from the last lane, which has accumulated the values of // each active lane in the wavefront. This will be our new value which we // will provide to the atomic operation. 
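— Reviewer note: after the inclusive scan, the last lane holds the reduction of all active lanes, which becomes the single value handed to the atomic, while buildShiftRight turns the inclusive scan into the exclusive scan each lane later uses as its own offset. A scalar check (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  // Inclusive scan of per-lane inputs {3, 4, 1, 4}.
  uint32_t Incl[4] = {3, 7, 8, 12};
  uint32_t Total = Incl[3];            // "read the value from the last lane"
  // Shift right by one lane, with the identity (0 for add) in lane 0.
  uint32_t Excl[4] = {0, Incl[0], Incl[1], Incl[2]};
  assert(Total == 12 && Excl[2] == 7); // lane 2's offset excludes its own input
  return 0;
}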
+ Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); if (TyBitWidth == 64) { Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty()); Value *const ExtractHi = - B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty()); + B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty()); CallInst *const ReadLaneLo = B.CreateIntrinsic( - Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)}); + Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx}); CallInst *const ReadLaneHi = B.CreateIntrinsic( - Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)}); + Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx}); Value *const PartialInsert = B.CreateInsertElement( UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0)); Value *const Insert = @@ -421,7 +505,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, NewV = B.CreateBitCast(Insert, Ty); } else if (TyBitWidth == 32) { NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, - {NewV, B.getInt32(63)}); + {NewV, LastLaneIdx}); } else { llvm_unreachable("Unhandled atomic bit width"); } @@ -493,77 +577,80 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, // original instruction. B.SetInsertPoint(&I); - // Create a PHI node to get our new atomic result into the exit block. - PHINode *const PHI = B.CreatePHI(Ty, 2); - PHI->addIncoming(UndefValue::get(Ty), EntryBB); - PHI->addIncoming(NewI, SingleLaneTerminator->getParent()); + const bool NeedResult = !I.use_empty(); + if (NeedResult) { + // Create a PHI node to get our new atomic result into the exit block. + PHINode *const PHI = B.CreatePHI(Ty, 2); + PHI->addIncoming(UndefValue::get(Ty), EntryBB); + PHI->addIncoming(NewI, SingleLaneTerminator->getParent()); - // We need to broadcast the value who was the lowest active lane (the first - // lane) to all other lanes in the wavefront. We use an intrinsic for this, - // but have to handle 64-bit broadcasts with two calls to this intrinsic. - Value *BroadcastI = nullptr; + // We need to broadcast the value who was the lowest active lane (the first + // lane) to all other lanes in the wavefront. We use an intrinsic for this, + // but have to handle 64-bit broadcasts with two calls to this intrinsic. 
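— Reviewer note: 64-bit values cannot go through the 32-bit readfirstlane directly, so the code below broadcasts the two halves separately and reassembles them. A scalar model of the split and rebuild (illustrative, not LLVM code):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t V = 0x0123456789ABCDEFULL;
  uint32_t Lo = static_cast<uint32_t>(V);       // CreateTrunc
  uint32_t Hi = static_cast<uint32_t>(V >> 32); // CreateLShr + CreateTrunc
  // ...each 32-bit half is broadcast with readfirstlane...
  uint64_t Rebuilt = (static_cast<uint64_t>(Hi) << 32) | Lo; // two insertelements + bitcast
  assert(Rebuilt == V);
  return 0;
}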
+ Value *BroadcastI = nullptr; - if (TyBitWidth == 64) { - Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty()); - Value *const ExtractHi = - B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty()); - CallInst *const ReadFirstLaneLo = - B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo); - CallInst *const ReadFirstLaneHi = - B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi); - Value *const PartialInsert = B.CreateInsertElement( - UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0)); - Value *const Insert = - B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1)); - BroadcastI = B.CreateBitCast(Insert, Ty); - } else if (TyBitWidth == 32) { + if (TyBitWidth == 64) { + Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty()); + Value *const ExtractHi = + B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty()); + CallInst *const ReadFirstLaneLo = + B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo); + CallInst *const ReadFirstLaneHi = + B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi); + Value *const PartialInsert = B.CreateInsertElement( + UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0)); + Value *const Insert = + B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1)); + BroadcastI = B.CreateBitCast(Insert, Ty); + } else if (TyBitWidth == 32) { - BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI); - } else { - llvm_unreachable("Unhandled atomic bit width"); - } + BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI); + } else { + llvm_unreachable("Unhandled atomic bit width"); + } - // Now that we have the result of our single atomic operation, we need to - // get our individual lane's slice into the result. We use the lane offset we - // previously calculated combined with the atomic result value we got from the - // first lane, to get our lane's index into the atomic result. - Value *LaneOffset = nullptr; - if (ValDivergent) { - LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan); - } else { - switch (Op) { - default: - llvm_unreachable("Unhandled atomic op"); - case AtomicRMWInst::Add: - case AtomicRMWInst::Sub: - LaneOffset = B.CreateMul(V, Mbcnt); - break; - case AtomicRMWInst::And: - case AtomicRMWInst::Or: - case AtomicRMWInst::Max: - case AtomicRMWInst::Min: - case AtomicRMWInst::UMax: - case AtomicRMWInst::UMin: - LaneOffset = B.CreateSelect(Cond, Identity, V); - break; - case AtomicRMWInst::Xor: - LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1)); - break; + // Now that we have the result of our single atomic operation, we need to + // get our individual lane's slice into the result. We use the lane offset + // we previously calculated combined with the atomic result value we got + // from the first lane, to get our lane's index into the atomic result. 
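— Reviewer note: the LaneOffset rules below can be sanity-checked against serialized execution. For add/sub, each lane's own atomic would have returned the base plus V times the number of active lanes before it, hence Mul(V, Mbcnt); for xor, only the parity of that count matters, hence Mul(V, And(Mbcnt, 1)); for min/max/and/or, the memory value stops changing after the first update, so the first active lane gets the identity as its offset and every later lane gets V. A worked check for the add case (illustrative):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Base = 100, V = 3; // Base: the value the single atomic returned
  uint32_t Mem = Base;
  for (uint32_t Mbcnt = 0; Mbcnt < 4; ++Mbcnt) {
    uint32_t Serialized = Mem;             // what lane Mbcnt's own atomic would return
    Mem += V;
    uint32_t Optimized = Base + V * Mbcnt; // BroadcastI + LaneOffset
    assert(Serialized == Optimized);
  }
  return 0;
}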
+ Value *LaneOffset = nullptr; + if (ValDivergent) { + LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan); + } else { + switch (Op) { + default: + llvm_unreachable("Unhandled atomic op"); + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + LaneOffset = B.CreateMul(V, Mbcnt); + break; + case AtomicRMWInst::And: + case AtomicRMWInst::Or: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + LaneOffset = B.CreateSelect(Cond, Identity, V); + break; + case AtomicRMWInst::Xor: + LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1)); + break; + } } - } - Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset); + Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset); - if (IsPixelShader) { - // Need a final PHI to reconverge to above the helper lane branch mask. - B.SetInsertPoint(PixelExitBB->getFirstNonPHI()); + if (IsPixelShader) { + // Need a final PHI to reconverge to above the helper lane branch mask. + B.SetInsertPoint(PixelExitBB->getFirstNonPHI()); - PHINode *const PHI = B.CreatePHI(Ty, 2); - PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB); - PHI->addIncoming(Result, I.getParent()); - I.replaceAllUsesWith(PHI); - } else { - // Replace the original atomic instruction with the new one. - I.replaceAllUsesWith(Result); + PHINode *const PHI = B.CreatePHI(Ty, 2); + PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB); + PHI->addIncoming(Result, I.getParent()); + I.replaceAllUsesWith(PHI); + } else { + // Replace the original atomic instruction with the new one. + I.replaceAllUsesWith(Result); + } } // And delete the original. diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index b107c357196d..58c44acde1a7 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -30,13 +30,15 @@ using namespace llvm; namespace { -struct OutgoingArgHandler : public CallLowering::ValueHandler { - OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, - MachineInstrBuilder MIB, CCAssignFn *AssignFn) - : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} +struct OutgoingValueHandler : public CallLowering::ValueHandler { + OutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI, + MachineInstrBuilder MIB, CCAssignFn *AssignFn) + : ValueHandler(B, MRI, AssignFn), MIB(MIB) {} MachineInstrBuilder MIB; + bool isIncomingArgumentHandler() const override { return false; } + Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { llvm_unreachable("not implemented"); @@ -49,15 +51,96 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler { void assignValueToReg(Register ValVReg, Register PhysReg, CCValAssign &VA) override { - MIB.addUse(PhysReg); - MIRBuilder.buildCopy(PhysReg, ValVReg); + Register ExtReg; + if (VA.getLocVT().getSizeInBits() < 32) { + // 16-bit types are reported as legal for 32-bit registers. We need to + // extend and do a 32-bit copy to avoid the verifier complaining about it. 
+ ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0); + } else + ExtReg = extendRegister(ValVReg, VA); + + MIRBuilder.buildCopy(PhysReg, ExtReg); + MIB.addUse(PhysReg, RegState::Implicit); } bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, const CallLowering::ArgInfo &Info, + ISD::ArgFlagsTy Flags, CCState &State) override { - return AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State); + return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); + } +}; + +struct IncomingArgHandler : public CallLowering::ValueHandler { + uint64_t StackUsed = 0; + + IncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI, + CCAssignFn *AssignFn) + : ValueHandler(B, MRI, AssignFn) {} + + Register getStackAddress(uint64_t Size, int64_t Offset, + MachinePointerInfo &MPO) override { + auto &MFI = MIRBuilder.getMF().getFrameInfo(); + int FI = MFI.CreateFixedObject(Size, Offset, true); + MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); + Register AddrReg = MRI.createGenericVirtualRegister( + LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32)); + MIRBuilder.buildFrameIndex(AddrReg, FI); + StackUsed = std::max(StackUsed, Size + Offset); + return AddrReg; + } + + void assignValueToReg(Register ValVReg, Register PhysReg, + CCValAssign &VA) override { + markPhysRegUsed(PhysReg); + + if (VA.getLocVT().getSizeInBits() < 32) { + // 16-bit types are reported as legal for 32-bit registers. We need to do + // a 32-bit copy, and truncate to avoid the verifier complaining about it. + auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg); + MIRBuilder.buildTrunc(ValVReg, Copy); + return; + } + + switch (VA.getLocInfo()) { + case CCValAssign::LocInfo::SExt: + case CCValAssign::LocInfo::ZExt: + case CCValAssign::LocInfo::AExt: { + auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg); + MIRBuilder.buildTrunc(ValVReg, Copy); + break; + } + default: + MIRBuilder.buildCopy(ValVReg, PhysReg); + break; + } + } + + void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, + MachinePointerInfo &MPO, CCValAssign &VA) override { + // FIXME: Get alignment + auto MMO = MIRBuilder.getMF().getMachineMemOperand( + MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, 1); + MIRBuilder.buildLoad(ValVReg, Addr, *MMO); + } + + /// How the physical register gets marked varies between formal + /// parameters (it's a basic-block live-in), and a call instruction + /// (it's an implicit-def of the BL). + virtual void markPhysRegUsed(unsigned PhysReg) = 0; + + // FIXME: What is the point of this being a callback? 
+ bool isIncomingArgumentHandler() const override { return true; } +}; + +struct FormalArgHandler : public IncomingArgHandler { + FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI, + CCAssignFn *AssignFn) + : IncomingArgHandler(B, MRI, AssignFn) {} + + void markPhysRegUsed(unsigned PhysReg) override { + MIRBuilder.getMBB().addLiveIn(PhysReg); } }; @@ -67,55 +150,198 @@ AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI) : CallLowering(&TLI) { } -bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, +void AMDGPUCallLowering::splitToValueTypes( + const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs, + const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv, + SplitArgTy PerformArgSplit) const { + const SITargetLowering &TLI = *getTLI<SITargetLowering>(); + LLVMContext &Ctx = OrigArg.Ty->getContext(); + + if (OrigArg.Ty->isVoidTy()) + return; + + SmallVector<EVT, 4> SplitVTs; + ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs); + + assert(OrigArg.Regs.size() == SplitVTs.size()); + + int SplitIdx = 0; + for (EVT VT : SplitVTs) { + unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT); + Type *Ty = VT.getTypeForEVT(Ctx); + + + + if (NumParts == 1) { + // No splitting to do, but we want to replace the original type (e.g. [1 x + // double] -> double). + SplitArgs.emplace_back(OrigArg.Regs[SplitIdx], Ty, + OrigArg.Flags, OrigArg.IsFixed); + + ++SplitIdx; + continue; + } + + LLT LLTy = getLLTForType(*Ty, DL); + + SmallVector<Register, 8> SplitRegs; + + EVT PartVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT); + Type *PartTy = PartVT.getTypeForEVT(Ctx); + LLT PartLLT = getLLTForType(*PartTy, DL); + + // FIXME: Should we be reporting all of the part registers for a single + // argument, and let handleAssignments take care of the repacking? + for (unsigned i = 0; i < NumParts; ++i) { + Register PartReg = MRI.createGenericVirtualRegister(PartLLT); + SplitRegs.push_back(PartReg); + SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags); + } + + PerformArgSplit(SplitRegs, LLTy, PartLLT, SplitIdx); + + ++SplitIdx; + } +} + +// Get the appropriate type to make \p OrigTy \p Factor times bigger. +static LLT getMultipleType(LLT OrigTy, int Factor) { + if (OrigTy.isVector()) { + return LLT::vector(OrigTy.getNumElements() * Factor, + OrigTy.getElementType()); + } + + return LLT::scalar(OrigTy.getSizeInBits() * Factor); +} + +// TODO: Move to generic code +static void unpackRegsToOrigType(MachineIRBuilder &B, + ArrayRef<Register> DstRegs, + Register SrcReg, + LLT SrcTy, + LLT PartTy) { + assert(DstRegs.size() > 1 && "Nothing to unpack"); + + MachineFunction &MF = B.getMF(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + const unsigned SrcSize = SrcTy.getSizeInBits(); + const unsigned PartSize = PartTy.getSizeInBits(); + + if (SrcTy.isVector() && !PartTy.isVector() && + PartSize > SrcTy.getElementType().getSizeInBits()) { + // Vector was scalarized, and the elements extended. 
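splitToValueTypes above creates one part register per calling-convention register, and the unpacking path continued below pads the source into a container that is a whole multiple of the part size when the sizes don't divide evenly. The rounding is plain ceiling division; a small sketch with assumed bit widths:

#include <cassert>

// Parts needed to cover SrcSize bits with PartSize-bit registers, and the
// padded container the value is inserted into before the pieces are extracted.
static unsigned roundedParts(unsigned SrcSize, unsigned PartSize) {
  return (SrcSize + PartSize - 1) / PartSize;
}

int main() {
  // e.g. a 96-bit value unpacked into 64-bit parts: two parts, extracted
  // from a 128-bit container built by inserting the value into an undef.
  assert(roundedParts(96, 64) == 2);
  assert(roundedParts(96, 64) * 64 == 128);
  // When the sizes divide evenly a plain unmerge suffices, no padding.
  assert(roundedParts(128, 64) == 2);
  return 0;
}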
+ auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), + SrcReg); + for (int i = 0, e = DstRegs.size(); i != e; ++i) + B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i)); + return; + } + + if (SrcSize % PartSize == 0) { + B.buildUnmerge(DstRegs, SrcReg); + return; + } + + const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize; + + LLT BigTy = getMultipleType(PartTy, NumRoundedParts); + auto ImpDef = B.buildUndef(BigTy); + + Register BigReg = MRI.createGenericVirtualRegister(BigTy); + B.buildInsert(BigReg, ImpDef.getReg(0), SrcReg, 0).getReg(0); + + int64_t Offset = 0; + for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize) + B.buildExtract(DstRegs[i], BigReg, Offset); +} + +/// Lower the return value for the already existing \p Ret. This assumes that +/// \p B's insertion point is correct. +bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B, + const Value *Val, ArrayRef<Register> VRegs, + MachineInstrBuilder &Ret) const { + if (!Val) + return true; + + auto &MF = B.getMF(); + const auto &F = MF.getFunction(); + const DataLayout &DL = MF.getDataLayout(); + + CallingConv::ID CC = F.getCallingConv(); + const SITargetLowering &TLI = *getTLI<SITargetLowering>(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + ArgInfo OrigRetInfo(VRegs, Val->getType()); + setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F); + SmallVector<ArgInfo, 4> SplitRetInfos; + + splitToValueTypes( + OrigRetInfo, SplitRetInfos, DL, MRI, CC, + [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) { + unpackRegsToOrigType(B, Regs, VRegs[VTSplitIdx], LLTy, PartLLT); + }); + + CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg()); + + OutgoingValueHandler RetHandler(B, MF.getRegInfo(), Ret, AssignFn); + return handleAssignments(B, SplitRetInfos, RetHandler); +} + +bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val, ArrayRef<Register> VRegs) const { - MachineFunction &MF = MIRBuilder.getMF(); + MachineFunction &MF = B.getMF(); MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); MFI->setIfReturnsVoid(!Val); - if (!Val) { - MIRBuilder.buildInstr(AMDGPU::S_ENDPGM).addImm(0); + assert(!Val == VRegs.empty() && "Return value without a vreg"); + + CallingConv::ID CC = B.getMF().getFunction().getCallingConv(); + const bool IsShader = AMDGPU::isShader(CC); + const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) || + AMDGPU::isKernel(CC); + if (IsWaveEnd) { + B.buildInstr(AMDGPU::S_ENDPGM) + .addImm(0); return true; } - Register VReg = VRegs[0]; + auto const &ST = B.getMF().getSubtarget<GCNSubtarget>(); - const Function &F = MF.getFunction(); - auto &DL = F.getParent()->getDataLayout(); - if (!AMDGPU::isShader(F.getCallingConv())) - return false; + unsigned ReturnOpc = + IsShader ? 
AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return; + auto Ret = B.buildInstrNoInsert(ReturnOpc); + Register ReturnAddrVReg; + if (ReturnOpc == AMDGPU::S_SETPC_B64_return) { + ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass); + Ret.addUse(ReturnAddrVReg); + } - const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>(); - SmallVector<EVT, 4> SplitVTs; - SmallVector<uint64_t, 4> Offsets; - ArgInfo OrigArg{VReg, Val->getType()}; - setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F); - ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0); + if (!lowerReturnVal(B, Val, VRegs, Ret)) + return false; - SmallVector<ArgInfo, 8> SplitArgs; - CCAssignFn *AssignFn = CCAssignFnForReturn(F.getCallingConv(), false); - for (unsigned i = 0, e = Offsets.size(); i != e; ++i) { - Type *SplitTy = SplitVTs[i].getTypeForEVT(F.getContext()); - SplitArgs.push_back({VRegs[i], SplitTy, OrigArg.Flags, OrigArg.IsFixed}); + if (ReturnOpc == AMDGPU::S_SETPC_B64_return) { + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF), + &AMDGPU::SGPR_64RegClass); + B.buildCopy(ReturnAddrVReg, LiveInReturn); } - auto RetInstr = MIRBuilder.buildInstrNoInsert(AMDGPU::SI_RETURN_TO_EPILOG); - OutgoingArgHandler Handler(MIRBuilder, MRI, RetInstr, AssignFn); - if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) - return false; - MIRBuilder.insertInstr(RetInstr); + // TODO: Handle CalleeSavedRegsViaCopy. + + B.insertInstr(Ret); return true; } -Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, +Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset) const { - MachineFunction &MF = MIRBuilder.getMF(); + MachineFunction &MF = B.getMF(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); MachineRegisterInfo &MRI = MF.getRegInfo(); const Function &F = MF.getFunction(); @@ -128,79 +354,37 @@ Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr); Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); - MIRBuilder.buildConstant(OffsetReg, Offset); + B.buildConstant(OffsetReg, Offset); - MIRBuilder.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg); + B.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg); return DstReg; } -void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder, +void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset, unsigned Align, Register DstReg) const { - MachineFunction &MF = MIRBuilder.getMF(); + MachineFunction &MF = B.getMF(); const Function &F = MF.getFunction(); const DataLayout &DL = F.getParent()->getDataLayout(); PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); unsigned TypeSize = DL.getTypeStoreSize(ParamTy); - Register PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset); + Register PtrReg = lowerParameterPtr(B, ParamTy, Offset); MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad | - MachineMemOperand::MONonTemporal | + MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant, TypeSize, Align); - MIRBuilder.buildLoad(DstReg, PtrReg, *MMO); -} - -static Register findFirstFreeSGPR(CCState &CCInfo) { - unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); - for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { - if 
(!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) { - return AMDGPU::SGPR0 + Reg; - } - } - llvm_unreachable("Cannot allocate sgpr"); -} - -static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { - const LLT S32 = LLT::scalar(32); - MachineRegisterInfo &MRI = MF.getRegInfo(); - - if (Info.hasWorkItemIDX()) { - Register Reg = AMDGPU::VGPR0; - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - - CCInfo.AllocateReg(Reg); - Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg)); - } - - if (Info.hasWorkItemIDY()) { - Register Reg = AMDGPU::VGPR1; - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - - CCInfo.AllocateReg(Reg); - Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); - } - - if (Info.hasWorkItemIDZ()) { - Register Reg = AMDGPU::VGPR2; - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - - CCInfo.AllocateReg(Reg); - Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); - } + B.buildLoad(DstReg, PtrReg, *MMO); } // Allocate special inputs passed in user SGPRs. static void allocateHSAUserSGPRs(CCState &CCInfo, - MachineIRBuilder &MIRBuilder, + MachineIRBuilder &B, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) { @@ -229,8 +413,8 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); Register VReg = MRI.createGenericVirtualRegister(P4); MRI.addLiveIn(InputPtrReg, VReg); - MIRBuilder.getMBB().addLiveIn(InputPtrReg); - MIRBuilder.buildCopy(VReg, InputPtrReg); + B.getMBB().addLiveIn(InputPtrReg); + B.buildCopy(VReg, InputPtrReg); CCInfo.AllocateReg(InputPtrReg); } @@ -250,74 +434,22 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, // these from the dispatch pointer. } -static void allocateSystemSGPRs(CCState &CCInfo, - MachineFunction &MF, - SIMachineFunctionInfo &Info, - CallingConv::ID CallConv, - bool IsShader) { - const LLT S32 = LLT::scalar(32); - MachineRegisterInfo &MRI = MF.getRegInfo(); - - if (Info.hasWorkGroupIDX()) { - Register Reg = Info.addWorkGroupIDX(); - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); - CCInfo.AllocateReg(Reg); - } - - if (Info.hasWorkGroupIDY()) { - Register Reg = Info.addWorkGroupIDY(); - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); - CCInfo.AllocateReg(Reg); - } - - if (Info.hasWorkGroupIDZ()) { - unsigned Reg = Info.addWorkGroupIDZ(); - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); - CCInfo.AllocateReg(Reg); - } - - if (Info.hasWorkGroupInfo()) { - unsigned Reg = Info.addWorkGroupInfo(); - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); - CCInfo.AllocateReg(Reg); - } - - if (Info.hasPrivateSegmentWaveByteOffset()) { - // Scratch wave offset passed in system SGPR. - unsigned PrivateSegmentWaveByteOffsetReg; - - if (IsShader) { - PrivateSegmentWaveByteOffsetReg = - Info.getPrivateSegmentWaveByteOffsetSystemSGPR(); - - // This is true if the scratch wave byte offset doesn't have a fixed - // location. 
- if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) { - PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); - Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); - } - } else - PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset(); - - MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); - CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); - } -} - bool AMDGPUCallLowering::lowerFormalArgumentsKernel( - MachineIRBuilder &MIRBuilder, const Function &F, + MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs) const { - MachineFunction &MF = MIRBuilder.getMF(); + MachineFunction &MF = B.getMF(); const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>(); MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const SITargetLowering &TLI = *getTLI<SITargetLowering>(); + const DataLayout &DL = F.getParent()->getDataLayout(); SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); - allocateHSAUserSGPRs(CCInfo, MIRBuilder, MF, *TRI, *Info); + allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info); unsigned i = 0; const unsigned KernArgBaseAlign = 16; @@ -343,123 +475,242 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL)); unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset); ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy)); - lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, ArgReg); + lowerParameter(B, ArgTy, ArgOffset, Align, ArgReg); if (OrigArgRegs.size() > 1) - unpackRegs(OrigArgRegs, ArgReg, ArgTy, MIRBuilder); + unpackRegs(OrigArgRegs, ArgReg, ArgTy, B); ++i; } - allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); - allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false); + TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); + TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false); return true; } +// TODO: Move this to generic code +static void packSplitRegsToOrigType(MachineIRBuilder &B, + ArrayRef<Register> OrigRegs, + ArrayRef<Register> Regs, + LLT LLTy, + LLT PartLLT) { + if (!LLTy.isVector() && !PartLLT.isVector()) { + B.buildMerge(OrigRegs[0], Regs); + return; + } + + if (LLTy.isVector() && PartLLT.isVector()) { + assert(LLTy.getElementType() == PartLLT.getElementType()); + + int DstElts = LLTy.getNumElements(); + int PartElts = PartLLT.getNumElements(); + if (DstElts % PartElts == 0) + B.buildConcatVectors(OrigRegs[0], Regs); + else { + // Deal with v3s16 split into v2s16 + assert(PartElts == 2 && DstElts % 2 != 0); + int RoundedElts = PartElts * ((DstElts + PartElts - 1) / PartElts); + + LLT RoundedDestTy = LLT::vector(RoundedElts, PartLLT.getElementType()); + auto RoundedConcat = B.buildConcatVectors(RoundedDestTy, Regs); + B.buildExtract(OrigRegs[0], RoundedConcat, 0); + } + + return; + } + + assert(LLTy.isVector() && !PartLLT.isVector()); + + LLT DstEltTy = LLTy.getElementType(); + if (DstEltTy == PartLLT) { + // Vector was trivially scalarized. + B.buildBuildVector(OrigRegs[0], Regs); + } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) { + // Deal with vector with 64-bit elements decomposed to 32-bit + // registers. Need to create intermediate 64-bit elements. 
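The branch that follows rebuilds a vector whose elements are wider than the part registers by first merging consecutive parts pairwise into full elements. A scalar model of that re-packing for 64-bit elements built from 32-bit parts:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Four 32-bit part registers carrying a v2s64 value, low half first.
  std::vector<uint32_t> Parts = {0x1111, 0x2222, 0x3333, 0x4444};
  const int PartsPerElt = 64 / 32;

  std::vector<uint64_t> Elts;  // intermediate 64-bit element merges
  for (size_t I = 0; I < Parts.size(); I += PartsPerElt)
    Elts.push_back(uint64_t(Parts[I]) | (uint64_t(Parts[I + 1]) << 32));

  assert(Elts.size() == 2);
  assert(Elts[0] == ((uint64_t(0x2222) << 32) | 0x1111));
  assert(Elts[1] == ((uint64_t(0x4444) << 32) | 0x3333));
  return 0;
}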
+ SmallVector<Register, 8> EltMerges; + int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits(); + + assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0); + + for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) { + auto Merge = B.buildMerge(DstEltTy, + Regs.take_front(PartsPerElt)); + EltMerges.push_back(Merge.getReg(0)); + Regs = Regs.drop_front(PartsPerElt); + } + + B.buildBuildVector(OrigRegs[0], EltMerges); + } else { + // Vector was split, and elements promoted to a wider type. + LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT); + auto BV = B.buildBuildVector(BVType, Regs); + B.buildTrunc(OrigRegs[0], BV); + } +} + bool AMDGPUCallLowering::lowerFormalArguments( - MachineIRBuilder &MIRBuilder, const Function &F, + MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs) const { + CallingConv::ID CC = F.getCallingConv(); + // The infrastructure for normal calling convention lowering is essentially // useless for kernels. We want to avoid any kind of legalization or argument // splitting. - if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) - return lowerFormalArgumentsKernel(MIRBuilder, F, VRegs); + if (CC == CallingConv::AMDGPU_KERNEL) + return lowerFormalArgumentsKernel(B, F, VRegs); - // AMDGPU_GS and AMDGP_HS are not supported yet. - if (F.getCallingConv() == CallingConv::AMDGPU_GS || - F.getCallingConv() == CallingConv::AMDGPU_HS) - return false; + const bool IsShader = AMDGPU::isShader(CC); + const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC); - MachineFunction &MF = MIRBuilder.getMF(); + MachineFunction &MF = B.getMF(); + MachineBasicBlock &MBB = B.getMBB(); MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); + const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = Subtarget.getRegisterInfo(); const DataLayout &DL = F.getParent()->getDataLayout(); - bool IsShader = AMDGPU::isShader(F.getCallingConv()); SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); + CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext()); + + if (!IsEntryFunc) { + Register ReturnAddrReg = TRI->getReturnAddressReg(MF); + Register LiveInReturn = MF.addLiveIn(ReturnAddrReg, + &AMDGPU::SGPR_64RegClass); + MBB.addLiveIn(ReturnAddrReg); + B.buildCopy(LiveInReturn, ReturnAddrReg); + } if (Info->hasImplicitBufferPtr()) { - unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI); + Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI); MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(ImplicitBufferPtrReg); } - unsigned NumArgs = F.arg_size(); - Function::const_arg_iterator CurOrigArg = F.arg_begin(); - const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>(); + + SmallVector<ArgInfo, 32> SplitArgs; + unsigned Idx = 0; unsigned PSInputNum = 0; - BitVector Skipped(NumArgs); - for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) { - EVT ValEVT = TLI.getValueType(DL, CurOrigArg->getType()); - // We can only hanlde simple value types at the moment. 
- ISD::ArgFlagsTy Flags; - assert(VRegs[i].size() == 1 && "Can't lower into more than one register"); - ArgInfo OrigArg{VRegs[i][0], CurOrigArg->getType()}; - setArgFlags(OrigArg, i + 1, DL, F); - Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType())); + for (auto &Arg : F.args()) { + if (DL.getTypeStoreSize(Arg.getType()) == 0) + continue; - if (F.getCallingConv() == CallingConv::AMDGPU_PS && - !OrigArg.Flags.isInReg() && !OrigArg.Flags.isByVal() && - PSInputNum <= 15) { - if (CurOrigArg->use_empty() && !Info->isPSInputAllocated(PSInputNum)) { - Skipped.set(i); - ++PSInputNum; - continue; - } + const bool InReg = Arg.hasAttribute(Attribute::InReg); - Info->markPSInputAllocated(PSInputNum); - if (!CurOrigArg->use_empty()) - Info->markPSInputEnabled(PSInputNum); + // SGPR arguments to functions not implemented. + if (!IsShader && InReg) + return false; + + if (Arg.hasAttribute(Attribute::SwiftSelf) || + Arg.hasAttribute(Attribute::SwiftError) || + Arg.hasAttribute(Attribute::Nest)) + return false; + + if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) { + const bool ArgUsed = !Arg.use_empty(); + bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum); + + if (!SkipArg) { + Info->markPSInputAllocated(PSInputNum); + if (ArgUsed) + Info->markPSInputEnabled(PSInputNum); + } ++PSInputNum; + + if (SkipArg) { + for (int I = 0, E = VRegs[Idx].size(); I != E; ++I) + B.buildUndef(VRegs[Idx][I]); + + ++Idx; + continue; + } } - CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(), - /*IsVarArg=*/false); + ArgInfo OrigArg(VRegs[Idx], Arg.getType()); + setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F); - if (ValEVT.isVector()) { - EVT ElemVT = ValEVT.getVectorElementType(); - if (!ValEVT.isSimple()) - return false; - MVT ValVT = ElemVT.getSimpleVT(); - bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, - OrigArg.Flags, CCInfo); - if (!Res) - return false; - } else { - MVT ValVT = ValEVT.getSimpleVT(); - if (!ValEVT.isSimple()) - return false; - bool Res = - AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo); + splitToValueTypes( + OrigArg, SplitArgs, DL, MRI, CC, + // FIXME: We should probably be passing multiple registers to + // handleAssignments to do this + [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) { + packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs, + LLTy, PartLLT); + }); - // Fail if we don't know how to handle this type. - if (Res) - return false; - } + ++Idx; } - Function::const_arg_iterator Arg = F.arg_begin(); + // At least one interpolation mode must be enabled or else the GPU will + // hang. + // + // Check PSInputAddr instead of PSInputEnable. The idea is that if the user + // set PSInputAddr, the user wants to enable some bits after the compilation + // based on run-time states. Since we can't know what the final PSInputEna + // will look like, we shouldn't do anything here and the user should take + // responsibility for the correct programming. + // + // Otherwise, the following restrictions apply: + // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. + // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be + // enabled too.
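The code after this comment applies exactly those restrictions; a hedged model of the predicate it tests (bit layout as described above, helper name hypothetical):

#include <cassert>

// True when the interpolation-mode workaround must kick in: no PERSP_*
// (bits 0-3) or LINEAR_* (bits 4-6) input enabled, or POS_W_FLOAT (bit 11)
// allocated without any PERSP_* input.
static bool needsWorkaround(unsigned PSInputAddr, bool PosWAllocated) {
  return (PSInputAddr & 0x7F) == 0 ||
         ((PSInputAddr & 0xF) == 0 && PosWAllocated);
}

int main() {
  assert(needsWorkaround(0x000, false));   // nothing enabled at all
  assert(needsWorkaround(0x840, true));    // LINEAR_* only; POS_W needs PERSP_*
  assert(!needsWorkaround(0x840, false));  // LINEAR_* alone is fine
  assert(!needsWorkaround(0x001, true));   // POS_W with a PERSP_* input
  return 0;
}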
+ if (CC == CallingConv::AMDGPU_PS) { + if ((Info->getPSInputAddr() & 0x7F) == 0 || + ((Info->getPSInputAddr() & 0xF) == 0 && + Info->isPSInputAllocated(11))) { + CCInfo.AllocateReg(AMDGPU::VGPR0); + CCInfo.AllocateReg(AMDGPU::VGPR1); + Info->markPSInputAllocated(0); + Info->markPSInputEnabled(0); + } - if (F.getCallingConv() == CallingConv::AMDGPU_VS || - F.getCallingConv() == CallingConv::AMDGPU_PS) { - for (unsigned i = 0, OrigArgIdx = 0; - OrigArgIdx != NumArgs && i != ArgLocs.size(); ++Arg, ++OrigArgIdx) { - if (Skipped.test(OrigArgIdx)) - continue; - assert(VRegs[OrigArgIdx].size() == 1 && - "Can't lower into more than 1 reg"); - CCValAssign &VA = ArgLocs[i++]; - MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx][0]); - MIRBuilder.getMBB().addLiveIn(VA.getLocReg()); - MIRBuilder.buildCopy(VRegs[OrigArgIdx][0], VA.getLocReg()); + if (Subtarget.isAmdPalOS()) { + // For isAmdPalOS, the user does not enable some bits after compilation + // based on run-time states; the register values being generated here are + // the final ones set in hardware. Therefore we need to apply the + // workaround to PSInputAddr and PSInputEnable together. (The case where + // a bit is set in PSInputAddr but not PSInputEnable is where the frontend + // set up an input arg for a particular interpolation mode, but nothing + // uses that input arg. Really we should have an earlier pass that removes + // such an arg.) + unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable(); + if ((PsInputBits & 0x7F) == 0 || + ((PsInputBits & 0xF) == 0 && + (PsInputBits >> 11 & 1))) + Info->markPSInputEnabled( + countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined)); } + } - allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), IsShader); - return true; + const SITargetLowering &TLI = *getTLI<SITargetLowering>(); + CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg()); + + if (!MBB.empty()) + B.setInstr(*MBB.begin()); + + FormalArgHandler Handler(B, MRI, AssignFn); + if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler)) + return false; + + if (!IsEntryFunc) { + // Special inputs come after user arguments. + TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); + } + + // Start adding system SGPRs. + if (IsEntryFunc) { + TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader); + } else { + CCInfo.AllocateReg(Info->getScratchRSrcReg()); + CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); + CCInfo.AllocateReg(Info->getFrameOffsetReg()); + TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } - return false; + // Move back to the end of the basic block. + B.setMBB(MBB); + + return true; } diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h index 3599659cac6a..53a562586bc0 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -20,26 +20,37 @@ namespace llvm { class AMDGPUTargetLowering; +class MachineInstrBuilder; class AMDGPUCallLowering: public CallLowering { - Register lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy, + Register lowerParameterPtr(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset) const; - void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy, - uint64_t Offset, unsigned Align, - Register DstReg) const; + void lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset, + unsigned Align, Register DstReg) const; - public: + /// A function of this type is used to perform value split action. 
+ using SplitArgTy = std::function<void(ArrayRef<Register>, LLT, LLT, int)>; + + void splitToValueTypes(const ArgInfo &OrigArgInfo, + SmallVectorImpl<ArgInfo> &SplitArgs, + const DataLayout &DL, MachineRegisterInfo &MRI, + CallingConv::ID CallConv, + SplitArgTy SplitArg) const; + + bool lowerReturnVal(MachineIRBuilder &B, const Value *Val, + ArrayRef<Register> VRegs, MachineInstrBuilder &Ret) const; + +public: AMDGPUCallLowering(const AMDGPUTargetLowering &TLI); - bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, + bool lowerReturn(MachineIRBuilder &B, const Value *Val, ArrayRef<Register> VRegs) const override; - bool lowerFormalArgumentsKernel(MachineIRBuilder &MIRBuilder, - const Function &F, + bool lowerFormalArgumentsKernel(MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs) const; - bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, + bool lowerFormalArguments(MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs) const override; static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg); static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg); diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td index 3688cd77542e..f8a54a61aac2 100644 --- a/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -24,22 +24,9 @@ def CC_SI : CallingConv<[ SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39, - SGPR40, SGPR41, SGPR42, SGPR43, SGPR44, SGPR45, SGPR46, SGPR47, - SGPR48, SGPR49, SGPR50, SGPR51, SGPR52, SGPR53, SGPR54, SGPR55, - SGPR56, SGPR57, SGPR58, SGPR59, SGPR60, SGPR61, SGPR62, SGPR63, - SGPR64, SGPR65, SGPR66, SGPR67, SGPR68, SGPR69, SGPR70, SGPR71, - SGPR72, SGPR73, SGPR74, SGPR75, SGPR76, SGPR77, SGPR78, SGPR79, - SGPR80, SGPR81, SGPR82, SGPR83, SGPR84, SGPR85, SGPR86, SGPR87, - SGPR88, SGPR89, SGPR90, SGPR91, SGPR92, SGPR93, SGPR94, SGPR95, - SGPR96, SGPR97, SGPR98, SGPR99, SGPR100, SGPR101, SGPR102, SGPR103, - SGPR104, SGPR105 + SGPR40, SGPR41, SGPR42, SGPR43 ]>>>, - // We have no way of referring to the generated register tuples - // here, so use a custom function. - CCIfInReg<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>, - CCIfByVal<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>, - // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs. CCIfNotInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, @@ -69,15 +56,7 @@ def RetCC_SI_Shader : CallingConv<[ SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39, - SGPR40, SGPR41, SGPR42, SGPR43, SGPR44, SGPR45, SGPR46, SGPR47, - SGPR48, SGPR49, SGPR50, SGPR51, SGPR52, SGPR53, SGPR54, SGPR55, - SGPR56, SGPR57, SGPR58, SGPR59, SGPR60, SGPR61, SGPR62, SGPR63, - SGPR64, SGPR65, SGPR66, SGPR67, SGPR68, SGPR69, SGPR70, SGPR71, - SGPR72, SGPR73, SGPR74, SGPR75, SGPR76, SGPR77, SGPR78, SGPR79, - SGPR80, SGPR81, SGPR82, SGPR83, SGPR84, SGPR85, SGPR86, SGPR87, - SGPR88, SGPR89, SGPR90, SGPR91, SGPR92, SGPR93, SGPR94, SGPR95, - SGPR96, SGPR97, SGPR98, SGPR99, SGPR100, SGPR101, SGPR102, SGPR103, - SGPR104, SGPR105 + SGPR40, SGPR41, SGPR42, SGPR43 ]>>, // 32*4 + 4 is the minimum for a fetch shader with 32 outputs. 
@@ -138,7 +117,6 @@ def CC_AMDGPU_Func : CallingConv<[ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, - CCIfType<[i64, f64, v2i32, v2f32, v3i32, v3f32, v4i32, v4f32, v5i32, v5f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>, CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>, CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>, CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>, @@ -157,7 +135,6 @@ def RetCC_AMDGPU_Func : CallingConv<[ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, - CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">> ]>; def CC_AMDGPU : CallingConv<[ diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index b750c6b5f6d2..1640a4a59ee2 100644 --- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -55,6 +55,12 @@ static cl::opt<bool> WidenLoads( cl::ReallyHidden, cl::init(true)); +static cl::opt<bool> UseMul24Intrin( + "amdgpu-codegenprepare-mul24", + cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, + cl::init(true)); + class AMDGPUCodeGenPrepare : public FunctionPass, public InstVisitor<AMDGPUCodeGenPrepare, bool> { const GCNSubtarget *ST = nullptr; @@ -509,7 +515,9 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const { } } - I.replaceAllUsesWith(insertValues(Builder, Ty, ResultVals)); + Value *NewVal = insertValues(Builder, Ty, ResultVals); + NewVal->takeName(&I); + I.replaceAllUsesWith(NewVal); I.eraseFromParent(); return true; @@ -879,7 +887,7 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { DA->isUniform(&I) && promoteUniformOpToI32(I)) return true; - if (replaceMulWithMul24(I)) + if (UseMul24Intrin && replaceMulWithMul24(I)) return true; bool Changed = false; diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp index e80797736363..61ce83b30e00 100644 --- a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp @@ -13,9 +13,9 @@ #include "AMDGPUFrameLowering.h" using namespace llvm; -AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, - int LAO, unsigned TransAl) - : TargetFrameLowering(D, StackAl, LAO, TransAl) { } +AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, Align StackAl, + int LAO, Align TransAl) + : TargetFrameLowering(D, StackAl, LAO, TransAl) {} AMDGPUFrameLowering::~AMDGPUFrameLowering() = default; diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h index 48b64488303e..92e256cf2829 100644 --- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -25,8 +25,8 @@ namespace llvm { /// See TargetFrameInfo for more comments. 
class AMDGPUFrameLowering : public TargetFrameLowering { public: - AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO, - unsigned TransAl = 1); + AMDGPUFrameLowering(StackDirection D, Align StackAl, int LAO, + Align TransAl = Align::None()); ~AMDGPUFrameLowering() override; /// \returns The number of 32-bit sub-registers that are used when storing diff --git a/lib/Target/AMDGPU/AMDGPUGISel.td b/lib/Target/AMDGPU/AMDGPUGISel.td index cad4c2ef404c..f2be1ca44d34 100644 --- a/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/lib/Target/AMDGPU/AMDGPUGISel.td @@ -12,10 +12,6 @@ include "AMDGPU.td" -def p0 : PtrValueType<i64, 0>; -def p1 : PtrValueType<i64, 1>; -def p4 : PtrValueType<i64, 4>; - def sd_vsrc0 : ComplexPattern<i32, 1, "">; def gi_vsrc0 : GIComplexOperandMatcher<s32, "selectVSRC0">, @@ -38,6 +34,18 @@ def gi_vop3omods : GIComplexOperandMatcher<s32, "selectVOP3OMods">, GIComplexPatternEquiv<VOP3OMods>; +def gi_vop3omods0clamp0omod : + GIComplexOperandMatcher<s32, "selectVOP3Mods0Clamp0OMod">, + GIComplexPatternEquiv<VOP3Mods0Clamp0OMod>; + +def gi_vop3opselmods0 : + GIComplexOperandMatcher<s32, "selectVOP3OpSelMods0">, + GIComplexPatternEquiv<VOP3OpSelMods0>; + +def gi_vop3opselmods : + GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">, + GIComplexPatternEquiv<VOP3OpSelMods>; + def gi_smrd_imm : GIComplexOperandMatcher<s64, "selectSmrdImm">, GIComplexPatternEquiv<SMRDImm>; @@ -50,12 +58,19 @@ def gi_smrd_sgpr : GIComplexOperandMatcher<s64, "selectSmrdSgpr">, GIComplexPatternEquiv<SMRDSgpr>; +// FIXME: Why are the atomic versions separated? def gi_flat_offset : GIComplexOperandMatcher<s64, "selectFlatOffset">, GIComplexPatternEquiv<FLATOffset>; def gi_flat_offset_signed : GIComplexOperandMatcher<s64, "selectFlatOffsetSigned">, GIComplexPatternEquiv<FLATOffsetSigned>; +def gi_flat_atomic : + GIComplexOperandMatcher<s64, "selectFlatOffset">, + GIComplexPatternEquiv<FLATAtomic>; +def gi_flat_signed_atomic : + GIComplexOperandMatcher<s64, "selectFlatOffsetSigned">, + GIComplexPatternEquiv<FLATSignedAtomic>; def gi_mubuf_scratch_offset : GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">, @@ -64,6 +79,44 @@ def gi_mubuf_scratch_offen : GIComplexOperandMatcher<s32, "selectMUBUFScratchOffen">, GIComplexPatternEquiv<MUBUFScratchOffen>; +def gi_ds_1addr_1offset : + GIComplexOperandMatcher<s32, "selectDS1Addr1Offset">, + GIComplexPatternEquiv<DS1Addr1Offset>; + + +// Separate load nodes are defined to glue m0 initialization in +// SelectionDAG. The GISel selector can just insert m0 initialization +// directly before selecting a glue-less load, so hide this +// distinction.
+ def : GINodeEquiv<G_LOAD, AMDGPUld_glue> { + let CheckMMOIsNonAtomic = 1; +} + +def : GINodeEquiv<G_STORE, AMDGPUst_glue> { + let CheckMMOIsNonAtomic = 1; +} + +def : GINodeEquiv<G_LOAD, AMDGPUatomic_ld_glue> { + bit CheckMMOIsAtomic = 1; +} + + + +def : GINodeEquiv<G_ATOMIC_CMPXCHG, atomic_cmp_swap_glue>; +def : GINodeEquiv<G_ATOMICRMW_XCHG, atomic_swap_glue>; +def : GINodeEquiv<G_ATOMICRMW_ADD, atomic_load_add_glue>; +def : GINodeEquiv<G_ATOMICRMW_SUB, atomic_load_sub_glue>; +def : GINodeEquiv<G_ATOMICRMW_AND, atomic_load_and_glue>; +def : GINodeEquiv<G_ATOMICRMW_OR, atomic_load_or_glue>; +def : GINodeEquiv<G_ATOMICRMW_XOR, atomic_load_xor_glue>; +def : GINodeEquiv<G_ATOMICRMW_MIN, atomic_load_min_glue>; +def : GINodeEquiv<G_ATOMICRMW_MAX, atomic_load_max_glue>; +def : GINodeEquiv<G_ATOMICRMW_UMIN, atomic_load_umin_glue>; +def : GINodeEquiv<G_ATOMICRMW_UMAX, atomic_load_umax_glue>; +def : GINodeEquiv<G_ATOMICRMW_FADD, atomic_load_fadd_glue>; + +def : GINodeEquiv<G_AMDGPU_FFBH_U32, AMDGPUffbh_u32>; class GISelSop2Pat < SDPatternOperator node, @@ -143,20 +196,6 @@ multiclass GISelVop2IntrPat < def : GISelSop2Pat <or, S_OR_B32, i32>; def : GISelVop2Pat <or, V_OR_B32_e32, i32>; -// FIXME: We can't re-use SelectionDAG patterns here because they match -// against a custom SDNode and we would need to create a generic machine -// instruction that is equivalent to the custom SDNode. This would also require -// us to custom legalize the intrinsic to the new generic machine instruction, -// but I can't get custom legalizing of intrinsic to work and I'm not sure if -// this is even supported yet. -def : GISelVop3Pat2ModsPat < - int_amdgcn_cvt_pkrtz, V_CVT_PKRTZ_F16_F32_e64, v2f16, f32>; - -defm : GISelVop2IntrPat <int_maxnum, V_MAX_F32_e32, f32>; -def : GISelVop3Pat2ModsPat <int_maxnum, V_MAX_F64, f64>; -defm : GISelVop2IntrPat <int_minnum, V_MIN_F32_e32, f32>; -def : GISelVop3Pat2ModsPat <int_minnum, V_MIN_F64, f64>; - // Since GlobalISel is more flexible than SelectionDAG, I think we can get // away with adding patterns for integer types and not legalizing all // loads and stores to vector types.
This should help simplify the load/store @@ -164,3 +203,6 @@ def : GISelVop3Pat2ModsPat <int_minnum, V_MIN_F64, f64>; foreach Ty = [i64, p0, p1, p4] in { defm : SMRD_Pattern <"S_LOAD_DWORDX2", Ty>; } + +def gi_as_i32timm : GICustomOperandRenderer<"renderTruncImm32">, + GISDNodeXFormEquiv<as_i32timm>; diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def index 0a1f48231b18..85d1ad349157 100644 --- a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def +++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def @@ -22,15 +22,17 @@ enum PartialMappingIdx { PM_SGPR128 = 9, PM_SGPR256 = 10, PM_SGPR512 = 11, - PM_VGPR1 = 12, - PM_VGPR16 = 16, - PM_VGPR32 = 17, - PM_VGPR64 = 18, - PM_VGPR128 = 19, - PM_VGPR256 = 20, - PM_VGPR512 = 21, - PM_SGPR96 = 22, - PM_VGPR96 = 23 + PM_SGPR1024 = 12, + PM_VGPR1 = 13, + PM_VGPR16 = 17, + PM_VGPR32 = 18, + PM_VGPR64 = 19, + PM_VGPR128 = 20, + PM_VGPR256 = 21, + PM_VGPR512 = 22, + PM_VGPR1024 = 23, + PM_SGPR96 = 24, + PM_VGPR96 = 25 }; const RegisterBankInfo::PartialMapping PartMappings[] { @@ -45,6 +47,7 @@ const RegisterBankInfo::PartialMapping PartMappings[] { {0, 128, SGPRRegBank}, {0, 256, SGPRRegBank}, {0, 512, SGPRRegBank}, + {0, 1024, SGPRRegBank}, {0, 1, VGPRRegBank}, // VGPR begin {0, 16, VGPRRegBank}, @@ -53,8 +56,9 @@ const RegisterBankInfo::PartialMapping PartMappings[] { {0, 128, VGPRRegBank}, {0, 256, VGPRRegBank}, {0, 512, VGPRRegBank}, + {0, 1024, VGPRRegBank}, {0, 96, SGPRRegBank}, - {0, 96, VGPRRegBank}, + {0, 96, VGPRRegBank} }; const RegisterBankInfo::ValueMapping ValMappings[] { @@ -65,41 +69,43 @@ const RegisterBankInfo::ValueMapping ValMappings[] { {&PartMappings[1], 1}, // SGPRs - {&PartMappings[2], 1}, + {&PartMappings[2], 1}, // 1 {nullptr, 0}, // Illegal power of 2 sizes {nullptr, 0}, {nullptr, 0}, - {&PartMappings[3], 1}, - {&PartMappings[4], 1}, - {&PartMappings[5], 1}, - {&PartMappings[6], 1}, - {&PartMappings[7], 1}, - {&PartMappings[8], 1}, + {&PartMappings[3], 1}, // 16 + {&PartMappings[4], 1}, // 32 + {&PartMappings[5], 1}, // 64 + {&PartMappings[6], 1}, // 128 + {&PartMappings[7], 1}, // 256 + {&PartMappings[8], 1}, // 512 + {&PartMappings[9], 1}, // 1024 - // VGPRs - {&PartMappings[9], 1}, + // VGPRs + {&PartMappings[10], 1}, // 1 {nullptr, 0}, {nullptr, 0}, {nullptr, 0}, - {&PartMappings[10], 1}, - {&PartMappings[11], 1}, - {&PartMappings[12], 1}, - {&PartMappings[13], 1}, - {&PartMappings[14], 1}, - {&PartMappings[15], 1}, - {&PartMappings[16], 1}, - {&PartMappings[17], 1} + {&PartMappings[11], 1}, // 16 + {&PartMappings[12], 1}, // 32 + {&PartMappings[13], 1}, // 64 + {&PartMappings[14], 1}, // 128 + {&PartMappings[15], 1}, // 256 + {&PartMappings[16], 1}, // 512 + {&PartMappings[17], 1}, // 1024 + {&PartMappings[18], 1}, + {&PartMappings[19], 1} }; const RegisterBankInfo::PartialMapping SGPROnly64BreakDown[] { - /*32-bit op*/ {0, 32, SGPRRegBank}, - /*2x32-bit op*/ {0, 32, SGPRRegBank}, - {32, 32, SGPRRegBank}, -/*<2x32-bit> op*/ {0, 64, SGPRRegBank}, + {0, 32, SGPRRegBank}, // 32-bit op + {0, 32, SGPRRegBank}, // 2x32-bit op + {32, 32, SGPRRegBank}, + {0, 64, SGPRRegBank}, // <2x32-bit> op - /*32-bit op*/ {0, 32, VGPRRegBank}, - /*2x32-bit op*/ {0, 32, VGPRRegBank}, - {32, 32, VGPRRegBank}, + {0, 32, VGPRRegBank}, // 32-bit op + {0, 32, VGPRRegBank}, // 2x32-bit op + {32, 32, VGPRRegBank}, }; @@ -116,7 +122,7 @@ const RegisterBankInfo::ValueMapping ValMappingsSGPR64OnlyVGPR32[] { enum ValueMappingIdx { SCCStartIdx = 0, SGPRStartIdx = 2, - VGPRStartIdx = 12 + VGPRStartIdx = 
13 }; const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID, diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index b31de0af5018..9f5bcd8ff5f0 100644 --- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -218,12 +218,13 @@ MetadataStreamerV2::getHSACodeProps(const MachineFunction &MF, assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || F.getCallingConv() == CallingConv::SPIR_KERNEL); - unsigned MaxKernArgAlign; + Align MaxKernArgAlign; HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F, MaxKernArgAlign); HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize; HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize; - HSACodeProps.mKernargSegmentAlign = std::max(MaxKernArgAlign, 4u); + HSACodeProps.mKernargSegmentAlign = + std::max(MaxKernArgAlign, Align(4)).value(); HSACodeProps.mWavefrontSize = STM.getWavefrontSize(); HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR; HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR; @@ -883,7 +884,7 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF, auto Kern = HSAMetadataDoc->getMapNode(); - unsigned MaxKernArgAlign; + Align MaxKernArgAlign; Kern[".kernarg_segment_size"] = Kern.getDocument()->getNode( STM.getKernArgSegmentSize(F, MaxKernArgAlign)); Kern[".group_segment_fixed_size"] = @@ -891,7 +892,7 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF, Kern[".private_segment_fixed_size"] = Kern.getDocument()->getNode(ProgramInfo.ScratchSize); Kern[".kernarg_segment_align"] = - Kern.getDocument()->getNode(std::max(uint32_t(4), MaxKernArgAlign)); + Kern.getDocument()->getNode(std::max(Align(4), MaxKernArgAlign).value()); Kern[".wavefront_size"] = Kern.getDocument()->getNode(STM.getWavefrontSize()); Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR); diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 2eecddbd7b01..80ac8ca67bcd 100644 --- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -52,7 +52,7 @@ public: class MetadataStreamerV3 final : public MetadataStreamer { private: std::unique_ptr<msgpack::Document> HSAMetadataDoc = - llvm::make_unique<msgpack::Document>(); + std::make_unique<msgpack::Document>(); void dump(StringRef HSAMetadataString) const; diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index ea730539f834..f330bd7ebcdd 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -172,8 +172,9 @@ private: MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const; - SDNode *glueCopyToM0LDSInit(SDNode *N) const; + SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const; SDNode *glueCopyToM0(SDNode *N, SDValue Val) const; + SDNode *glueCopyToM0LDSInit(SDNode *N) const; const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); @@ -186,10 +187,11 @@ private: bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const; + SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, 
SDValue &GLC, - SDValue &SLC, SDValue &TFE, SDValue &DLC) const; + SDValue &SLC, SDValue &TFE, SDValue &DLC, + SDValue &SWZ) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &SLC) const; @@ -202,21 +204,20 @@ private: bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const; + SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &SLC) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset) const; + template <bool IsSigned> + bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr, + SDValue &Offset, SDValue &SLC) const; bool SelectFlatAtomic(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const; bool SelectFlatAtomicSigned(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const; - template <bool IsSigned> - bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr, - SDValue &Offset, SDValue &SLC) const; - bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const; SDValue Expand32BitAddress(SDValue Addr) const; @@ -262,6 +263,8 @@ private: SDValue getHi16Elt(SDValue In) const; + SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const; + void SelectADD_SUB_I64(SDNode *N); void SelectAddcSubb(SDNode *N); void SelectUADDO_USUBO(SDNode *N); @@ -282,6 +285,7 @@ private: void SelectDSAppendConsume(SDNode *N, unsigned IntrID); void SelectDS_GWS(SDNode *N, unsigned IntrID); void SelectINTRINSIC_W_CHAIN(SDNode *N); + void SelectINTRINSIC_WO_CHAIN(SDNode *N); void SelectINTRINSIC_VOID(SDNode *N); protected: @@ -543,7 +547,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, if (!N->isMachineOpcode()) { if (N->getOpcode() == ISD::CopyToReg) { unsigned Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo(); return MRI.getRegClass(Reg); } @@ -582,19 +586,10 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, } } -SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const { - const SITargetLowering& Lowering = - *static_cast<const SITargetLowering*>(getTargetLowering()); - - assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain"); - - SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), - Val); - - SDValue Glue = M0.getValue(1); - +SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain, + SDValue Glue) const { SmallVector <SDValue, 8> Ops; - Ops.push_back(M0); // Replace the chain. + Ops.push_back(NewChain); // Replace the chain. 
for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) Ops.push_back(N->getOperand(i)); @@ -602,6 +597,16 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const { return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops); } +SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const { + const SITargetLowering& Lowering = + *static_cast<const SITargetLowering*>(getTargetLowering()); + + assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain"); + + SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val); + return glueCopyToOp(N, M0, M0.getValue(1)); +} + SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const { unsigned AS = cast<MemSDNode>(N)->getAddressSpace(); if (AS == AMDGPUAS::LOCAL_ADDRESS) { @@ -635,13 +640,13 @@ MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm, static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { switch (NumVectorElts) { case 1: - return AMDGPU::SReg_32_XM0RegClassID; + return AMDGPU::SReg_32RegClassID; case 2: return AMDGPU::SReg_64RegClassID; case 3: return AMDGPU::SGPR_96RegClassID; case 4: - return AMDGPU::SReg_128RegClassID; + return AMDGPU::SGPR_128RegClassID; case 5: return AMDGPU::SGPR_160RegClassID; case 8: @@ -713,12 +718,17 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { return; // Already selected. } - if (isa<AtomicSDNode>(N) || + // isa<MemSDNode> almost works but is slightly too permissive for some DS + // intrinsics. + if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) || (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC || Opc == ISD::ATOMIC_LOAD_FADD || Opc == AMDGPUISD::ATOMIC_LOAD_FMIN || - Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) + Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) { N = glueCopyToM0LDSInit(N); + SelectCode(N); + return; + } switch (Opc) { default: @@ -781,7 +791,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SDValue RC, SubReg0, SubReg1; SDLoc DL(N); if (N->getValueType(0) == MVT::i128) { - RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32); + RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32); SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32); SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32); } else if (N->getValueType(0) == MVT::i64) { @@ -815,14 +825,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0))); return; } - case ISD::LOAD: - case ISD::STORE: - case ISD::ATOMIC_LOAD: - case ISD::ATOMIC_STORE: { - N = glueCopyToM0LDSInit(N); - break; - } - case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { // There is a scalar version available, but unlike the vector version which @@ -908,6 +910,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectINTRINSIC_W_CHAIN(N); return; } + case ISD::INTRINSIC_WO_CHAIN: { + SelectINTRINSIC_WO_CHAIN(N); + return; + } case ISD::INTRINSIC_VOID: { SelectINTRINSIC_VOID(N); return; @@ -961,6 +967,14 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, return true; } +SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val, + const SDLoc &DL) const { + SDNode *Mov = CurDAG->getMachineNode( + AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getTargetConstant(Val, DL, MVT::i32)); + return SDValue(Mov, 0); +} + // FIXME: Should only handle addcarry/subcarry void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { SDLoc DL(N); @@ -1308,7 +1322,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, 
SDValue &Ptr, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const { + SDValue &TFE, SDValue &DLC, + SDValue &SWZ) const { // Subtarget prefers to use flat instruction if (Subtarget->useFlatForGlobal()) return false; @@ -1321,6 +1336,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); DLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1); Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); @@ -1400,7 +1416,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, SDValue &TFE, - SDValue &DLC) const { + SDValue &DLC, SDValue &SWZ) const { SDValue Ptr, Offen, Idxen, Addr64; // addr64 bit was removed for volcanic islands. @@ -1408,7 +1424,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, return false; if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE, DLC)) + GLC, SLC, TFE, DLC, SWZ)) return false; ConstantSDNode *C = cast<ConstantSDNode>(Addr64); @@ -1430,9 +1446,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &Offset, SDValue &SLC) const { SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); - SDValue GLC, TFE, DLC; + SDValue GLC, TFE, DLC, SWZ; - return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC); + return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ); } static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { @@ -1557,13 +1573,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const { + SDValue &TFE, SDValue &DLC, + SDValue &SWZ) const { SDValue Ptr, VAddr, Offen, Idxen, Addr64; const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE, DLC)) + GLC, SLC, TFE, DLC, SWZ)) return false; if (!cast<ConstantSDNode>(Offen)->getSExtValue() && @@ -1585,16 +1602,30 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset ) const { - SDValue GLC, SLC, TFE, DLC; + SDValue GLC, SLC, TFE, DLC, SWZ; - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC); + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); } bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &SLC) const { - SDValue GLC, TFE, DLC; + SDValue GLC, TFE, DLC, SWZ; - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC); + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); +} + +// Find a load or store from corresponding pattern root. +// Roots may be build_vector, bitconvert or their combinations. 
+static MemSDNode* findMemSDNode(SDNode *N) { + N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode(); + if (MemSDNode *MN = dyn_cast<MemSDNode>(N)) + return MN; + assert(isa<BuildVectorSDNode>(N)); + for (SDValue V : N->op_values()) + if (MemSDNode *MN = + dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V))) + return MN; + llvm_unreachable("cannot find MemSDNode in the pattern!"); } template <bool IsSigned> @@ -1603,8 +1634,95 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const { - return static_cast<const SITargetLowering*>(getTargetLowering())-> - SelectFlatOffset(IsSigned, *CurDAG, N, Addr, VAddr, Offset, SLC); + int64_t OffsetVal = 0; + + if (Subtarget->hasFlatInstOffsets() && + (!Subtarget->hasFlatSegmentOffsetBug() || + findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) && + CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); + + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + unsigned AS = findMemSDNode(N)->getAddressSpace(); + if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) { + Addr = N0; + OffsetVal = COffsetVal; + } else { + // If the offset doesn't fit, put the low bits into the offset field and + // add the rest. + + SDLoc DL(N); + uint64_t ImmField; + const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned); + if (IsSigned) { + ImmField = SignExtend64(COffsetVal, NumBits); + + // Don't use a negative offset field if the base offset is positive. + // Since the scheduler currently relies on the offset field, doing so + // could result in strange scheduling decisions. + + // TODO: Should we not do this in the opposite direction as well? + if (static_cast<int64_t>(COffsetVal) > 0) { + if (static_cast<int64_t>(ImmField) < 0) { + const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits - 1); + ImmField = COffsetVal & OffsetMask; + } + } + } else { + // TODO: Should we do this for a negative offset? + const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits); + ImmField = COffsetVal & OffsetMask; + } + + uint64_t RemainderOffset = COffsetVal - ImmField; + + assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned)); + assert(RemainderOffset + ImmField == COffsetVal); + + OffsetVal = ImmField; + + // TODO: Should this try to use a scalar add pseudo if the base address is + // uniform and saddr is usable? 
+ SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + + SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub0); + SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub1); + + SDValue AddOffsetLo + = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); + SDValue AddOffsetHi + = getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); + + SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); + SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + + SDNode *Add = CurDAG->getMachineNode( + AMDGPU::V_ADD_I32_e64, DL, VTs, + {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); + + SDNode *Addc = CurDAG->getMachineNode( + AMDGPU::V_ADDC_U32_e64, DL, VTs, + {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); + + SDValue RegSequenceArgs[] = { + CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32), + SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1 + }; + + Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::i64, RegSequenceArgs), 0); + } + } + + VAddr = Addr; + Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16); + SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1); + return true; } bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N, @@ -1616,10 +1734,10 @@ bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N, } bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N, - SDValue Addr, - SDValue &VAddr, - SDValue &Offset, - SDValue &SLC) const { + SDValue Addr, + SDValue &VAddr, + SDValue &Offset, + SDValue &SLC) const { return SelectFlatOffset<true>(N, Addr, VAddr, Offset, SLC); } @@ -2158,10 +2276,12 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { // offset field) % 64. Some versions of the programming guide omit the m0 // part, or claim it's from offset 0. if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) { - // If we have a constant offset, try to use the default value for m0 as a - // base to possibly avoid setting it up. - glueCopyToM0(N, CurDAG->getTargetConstant(-1, SL, MVT::i32)); - ImmOffset = ConstOffset->getZExtValue() + 1; + // If we have a constant offset, try to use the 0 in m0 as the base. + // TODO: Look into changing the default m0 initialization value. If the + // default -1 only set the low 16-bits, we could leave it as-is and add 1 to + // the immediate offset. + glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32)); + ImmOffset = ConstOffset->getZExtValue(); } else { if (CurDAG->isBaseWithConstantOffset(BaseOffset)) { ImmOffset = BaseOffset.getConstantOperandVal(1); @@ -2182,22 +2302,7 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { glueCopyToM0(N, SDValue(M0Base, 0)); } - SDValue V0; SDValue Chain = N->getOperand(0); - SDValue Glue; - if (HasVSrc) { - SDValue VSrc0 = N->getOperand(2); - - // The manual doesn't mention this, but it seems only v0 works. - V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32); - - SDValue CopyToV0 = CurDAG->getCopyToReg( - N->getOperand(0), SL, V0, VSrc0, - N->getOperand(N->getNumOperands() - 1)); - Chain = CopyToV0; - Glue = CopyToV0.getValue(1); - } - SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32); // TODO: Can this just be removed from the instruction? 
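The constant-offset branch of SelectDS_GWS above leans on the addressing rule quoted in its comment: the hardware selects the GWS resource as (M0 + instruction offset field) % 64. A minimal sketch of why the new m0 = 0 scheme with the raw offset matches the old m0 = -1 scheme with offset + 1; this is illustrative C++ with a hypothetical gwsResource helper, not target code:

#include <cstdint>
#include <cstdio>

// GWS resource selection per the comment above: 32-bit wraparound add of M0
// and the instruction's offset field, then modulo 64.
static uint32_t gwsResource(uint32_t M0, uint32_t OffsetField) {
  return (M0 + OffsetField) % 64;
}

int main() {
  uint32_t C = 17; // some constant offset from the intrinsic
  std::printf("%u\n", gwsResource(0xFFFFFFFFu, C + 1)); // old: m0 = -1, biased offset
  std::printf("%u\n", gwsResource(0u, C));              // new: m0 = 0, raw offset
  // Both print 17: 0xFFFFFFFF + (C + 1) wraps back to C in 32 bits.
}

This also explains the TODO in the hunk above: if the default m0 value only set the low 16 bits, the bias could be reintroduced without changing the selected resource.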
@@ -2206,14 +2311,11 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { const unsigned Opc = gwsIntrinToOpcode(IntrID); SmallVector<SDValue, 5> Ops; if (HasVSrc) - Ops.push_back(V0); + Ops.push_back(N->getOperand(2)); Ops.push_back(OffsetField); Ops.push_back(GDS); Ops.push_back(Chain); - if (HasVSrc) - Ops.push_back(Glue); - SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO}); } @@ -2233,6 +2335,28 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { SelectCode(N); } +void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { + unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + unsigned Opcode; + switch (IntrID) { + case Intrinsic::amdgcn_wqm: + Opcode = AMDGPU::WQM; + break; + case Intrinsic::amdgcn_softwqm: + Opcode = AMDGPU::SOFT_WQM; + break; + case Intrinsic::amdgcn_wwm: + Opcode = AMDGPU::WWM; + break; + default: + SelectCode(N); + return; + } + + SDValue Src = N->getOperand(1); + CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src}); +} + void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) { unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); switch (IntrID) { diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 39016ed37193..1115d8c23620 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -12,10 +12,6 @@ // //===----------------------------------------------------------------------===// -#define AMDGPU_LOG2E_F 1.44269504088896340735992468100189214f -#define AMDGPU_LN2_F 0.693147180559945309417232121458176568f -#define AMDGPU_LN10_F 2.30258509299404568401799145468436421f - #include "AMDGPUISelLowering.h" #include "AMDGPU.h" #include "AMDGPUCallLowering.h" @@ -37,82 +33,9 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Support/MathExtras.h" using namespace llvm; -static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State, - const TargetRegisterClass *RC, - unsigned NumRegs) { - ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs); - unsigned RegResult = State.AllocateReg(RegList); - if (RegResult == AMDGPU::NoRegister) - return false; - - State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo)); - return true; -} - -static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - switch (LocVT.SimpleTy) { - case MVT::i64: - case MVT::f64: - case MVT::v2i32: - case MVT::v2f32: - case MVT::v4i16: - case MVT::v4f16: { - // Up to SGPR0-SGPR105 - return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, - &AMDGPU::SGPR_64RegClass, 53); - } - default: - return false; - } -} - -// Allocate up to VGPR31. -// -// TODO: Since there are no VGPR alignent requirements would it be better to -// split into individual scalar registers? 
-static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - switch (LocVT.SimpleTy) { - case MVT::i64: - case MVT::f64: - case MVT::v2i32: - case MVT::v2f32: - case MVT::v4i16: - case MVT::v4f16: { - return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, - &AMDGPU::VReg_64RegClass, 31); - } - case MVT::v4i32: - case MVT::v4f32: - case MVT::v2i64: - case MVT::v2f64: { - return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, - &AMDGPU::VReg_128RegClass, 29); - } - case MVT::v8i32: - case MVT::v8f32: { - return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, - &AMDGPU::VReg_256RegClass, 25); - - } - case MVT::v16i32: - case MVT::v16f32: { - return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, - &AMDGPU::VReg_512RegClass, 17); - - } - default: - return false; - } -} - #include "AMDGPUGenCallingConv.inc" // Find a larger type to do a load / store of a vector with. @@ -208,7 +131,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); } - for (MVT VT : MVT::integer_vector_valuetypes()) { + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) { setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); @@ -218,6 +141,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); @@ -225,8 +151,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); @@ -286,8 +215,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); + setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand); setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); + setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand); + setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); @@ -571,6 +503,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FABS); setTargetDAGCombine(ISD::AssertZext); setTargetDAGCombine(ISD::AssertSext); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); } 
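Registering ISD::INTRINSIC_WO_CHAIN as a combine target here is what lets the chainless amdgcn.mul.i24/amdgcn.mul.u24 nodes reach simplifyI24 (see performIntrinsicWOChainCombine further down). The combine is sound because a 24-bit multiply ignores everything above bit 23 of each source; a hedged sketch of that semantic, with mulU24 standing in for V_MUL_U32_U24:

#include <cstdint>
#include <cstdio>

// 24-bit multiply semantics: only bits [23:0] of each source participate, so
// the combine may rewrite an operand as long as those low bits are preserved.
// This mirrors simplifyI24's APInt::getLowBitsSet(..., 24) demanded-bits mask.
static uint32_t mulU24(uint32_t A, uint32_t B) {
  return (A & 0xFFFFFFu) * (B & 0xFFFFFFu); // low 32 bits of the product
}

int main() {
  // Garbage above bit 23 does not change the result; both lines print 48.
  std::printf("%u\n", mulU24(0xFF000010u, 3u));
  std::printf("%u\n", mulU24(0x00000010u, 0xAB000003u));
}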
//===----------------------------------------------------------------------===// @@ -630,15 +563,26 @@ static bool hasSourceMods(const SDNode *N) { case ISD::FREM: case ISD::INLINEASM: case ISD::INLINEASM_BR: - case AMDGPUISD::INTERP_P1: - case AMDGPUISD::INTERP_P2: case AMDGPUISD::DIV_SCALE: + case ISD::INTRINSIC_W_CHAIN: // TODO: Should really be looking at the users of the bitcast. These are // problematic because bitcasts are used to legalize all stores to integer // types. case ISD::BITCAST: return false; + case ISD::INTRINSIC_WO_CHAIN: { + switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) { + case Intrinsic::amdgcn_interp_p1: + case Intrinsic::amdgcn_interp_p2: + case Intrinsic::amdgcn_interp_mov: + case Intrinsic::amdgcn_interp_p1_f16: + case Intrinsic::amdgcn_interp_p2_f16: + return false; + default: + return true; + } + } default: return true; } @@ -745,8 +689,9 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy, return false; bool Fast = false; - return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), CastTy, - MMO, &Fast) && Fast; + return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + CastTy, MMO, &Fast) && + Fast; } // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also @@ -782,9 +727,8 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const { break; case ISD::LOAD: { - const LoadSDNode * L = dyn_cast<LoadSDNode>(N); - if (L->getMemOperand()->getAddrSpace() - == AMDGPUAS::CONSTANT_ADDRESS_32BIT) + if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() == + AMDGPUAS::CONSTANT_ADDRESS_32BIT) return true; return false; } @@ -1199,9 +1143,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::FROUND: return LowerFROUND(Op, DAG); case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); case ISD::FLOG: - return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F); + return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef); case ISD::FLOG10: - return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F); + return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f); case ISD::FEXP: return lowerFEXP(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); @@ -1236,7 +1180,7 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, } } -static bool hasDefinedInitializer(const GlobalValue *GV) { +bool AMDGPUTargetLowering::hasDefinedInitializer(const GlobalValue *GV) { const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); if (!GVar || !GVar->hasInitializer()) return false; @@ -2349,30 +2293,13 @@ SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand); } -// Return M_LOG2E of appropriate type -static SDValue getLog2EVal(SelectionDAG &DAG, const SDLoc &SL, EVT VT) { - switch (VT.getScalarType().getSimpleVT().SimpleTy) { - case MVT::f32: - return DAG.getConstantFP(1.44269504088896340735992468100189214f, SL, VT); - case MVT::f16: - return DAG.getConstantFP( - APFloat(APFloat::IEEEhalf(), "1.44269504088896340735992468100189214"), - SL, VT); - case MVT::f64: - return DAG.getConstantFP( - APFloat(APFloat::IEEEdouble(), "0x1.71547652b82fep+0"), SL, VT); - default: - llvm_unreachable("unsupported fp type"); - } -} - // exp2(M_LOG2E_F * f); SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc SL(Op); SDValue Src = Op.getOperand(0); - const SDValue K = getLog2EVal(DAG, SL, VT); + const SDValue K = DAG.getConstantFP(numbers::log2e, 
SL, VT); SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags()); return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags()); } @@ -2836,8 +2763,16 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) { static SDValue simplifyI24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; - SDValue LHS = Node24->getOperand(0); - SDValue RHS = Node24->getOperand(1); + bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN; + + SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0); + SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1); + unsigned NewOpcode = Node24->getOpcode(); + if (IsIntrin) { + unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue(); + NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ? + AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; + } APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24); @@ -2847,7 +2782,7 @@ static SDValue simplifyI24(SDNode *Node24, SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded); SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded); if (DemandedLHS || DemandedRHS) - return DAG.getNode(Node24->getOpcode(), SDLoc(Node24), Node24->getVTList(), + return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(), DemandedLHS ? DemandedLHS : LHS, DemandedRHS ? DemandedRHS : RHS); @@ -2904,54 +2839,6 @@ bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const { return true; } -// Find a load or store from corresponding pattern root. -// Roots may be build_vector, bitconvert or their combinations. -static MemSDNode* findMemSDNode(SDNode *N) { - N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode(); - if (MemSDNode *MN = dyn_cast<MemSDNode>(N)) - return MN; - assert(isa<BuildVectorSDNode>(N)); - for (SDValue V : N->op_values()) - if (MemSDNode *MN = - dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V))) - return MN; - llvm_unreachable("cannot find MemSDNode in the pattern!"); -} - -bool AMDGPUTargetLowering::SelectFlatOffset(bool IsSigned, - SelectionDAG &DAG, - SDNode *N, - SDValue Addr, - SDValue &VAddr, - SDValue &Offset, - SDValue &SLC) const { - const GCNSubtarget &ST = - DAG.getMachineFunction().getSubtarget<GCNSubtarget>(); - int64_t OffsetVal = 0; - - if (ST.hasFlatInstOffsets() && - (!ST.hasFlatSegmentOffsetBug() || - findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) && - DAG.isBaseWithConstantOffset(Addr)) { - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); - - const SIInstrInfo *TII = ST.getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, findMemSDNode(N)->getAddressSpace(), - IsSigned)) { - Addr = N0; - OffsetVal = COffsetVal; - } - } - - VAddr = Addr; - Offset = DAG.getTargetConstant(OffsetVal, SDLoc(), MVT::i16); - SLC = DAG.getTargetConstant(0, SDLoc(), MVT::i1); - - return true; -} - // Replace load of an illegal type with a store of a bitcast to a friendlier // type. 
SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, @@ -3085,6 +2972,19 @@ SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N, return SDValue(); } + +SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( + SDNode *N, DAGCombinerInfo &DCI) const { + unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IID) { + case Intrinsic::amdgcn_mul_i24: + case Intrinsic::amdgcn_mul_u24: + return simplifyI24(N, DCI); + default: + return SDValue(); + } +} + /// Split the 64-bit value \p LHS into two 32-bit components, and perform the /// binary operation \p Opc to it with the corresponding constant operands. SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( @@ -4173,6 +4073,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, case ISD::AssertZext: case ISD::AssertSext: return performAssertSZExtCombine(N, DCI); + case ISD::INTRINSIC_WO_CHAIN: + return performIntrinsicWOChainCombine(N, DCI); } return SDValue(); } @@ -4203,14 +4105,28 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT); } +// This may be called multiple times, and nothing prevents creating multiple +// objects at the same offset. See if we already defined this object. +static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, + int64_t Offset) { + for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) { + if (MFI.getObjectOffset(I) == Offset) { + assert(MFI.getObjectSize(I) == Size); + return I; + } + } + + return MFI.CreateFixedObject(Size, Offset, true); +} + SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); + int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset); - int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true); auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset); SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32); @@ -4260,7 +4176,7 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction()); unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction()); - unsigned Alignment = ST.getAlignmentForImplicitArgPtr(); + const Align Alignment = ST.getAlignmentForImplicitArgPtr(); uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) + ExplicitArgOffset; switch (Param) { @@ -4295,6 +4211,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FRACT) NODE_NAME_CASE(SETCC) NODE_NAME_CASE(SETREG) + NODE_NAME_CASE(DENORM_MODE) NODE_NAME_CASE(FMA_W_CHAIN) NODE_NAME_CASE(FMUL_W_CHAIN) NODE_NAME_CASE(CLAMP) @@ -4377,13 +4294,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(KILL) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; - NODE_NAME_CASE(INIT_EXEC) - NODE_NAME_CASE(INIT_EXEC_FROM_INPUT) - NODE_NAME_CASE(SENDMSG) - NODE_NAME_CASE(SENDMSGHALT) - NODE_NAME_CASE(INTERP_MOV) - NODE_NAME_CASE(INTERP_P1) - NODE_NAME_CASE(INTERP_P2) NODE_NAME_CASE(INTERP_P1LL_F16) NODE_NAME_CASE(INTERP_P1LV_F16) NODE_NAME_CASE(INTERP_P2_F16) @@ -4428,6 +4338,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_ATOMIC_AND) NODE_NAME_CASE(BUFFER_ATOMIC_OR) NODE_NAME_CASE(BUFFER_ATOMIC_XOR) + NODE_NAME_CASE(BUFFER_ATOMIC_INC) + 
NODE_NAME_CASE(BUFFER_ATOMIC_DEC) NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) NODE_NAME_CASE(BUFFER_ATOMIC_FADD) NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD) @@ -4576,9 +4488,9 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I; Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I; } else if (SelBits == 0x0c) { - Known.Zero |= 0xff << I; + Known.Zero |= 0xFFull << I; } else if (SelBits > 0x0c) { - Known.One |= 0xff << I; + Known.One |= 0xFFull << I; } Sel >>= 8; } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index fe7ad694943d..dea0d1d4343a 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -38,6 +38,7 @@ private: public: static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG); static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG); + static bool hasDefinedInitializer(const GlobalValue *GV); protected: SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; @@ -78,6 +79,7 @@ protected: SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, @@ -324,10 +326,6 @@ public: } AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; - - bool SelectFlatOffset(bool IsSigned, SelectionDAG &DAG, SDNode *N, - SDValue Addr, SDValue &VAddr, SDValue &Offset, - SDValue &SLC) const; }; namespace AMDGPUISD { @@ -369,6 +367,9 @@ enum NodeType : unsigned { // result bit per item in the wavefront. SETCC, SETREG, + + DENORM_MODE, + // FP ops with input and output chain. FMA_W_CHAIN, FMUL_W_CHAIN, @@ -475,13 +476,6 @@ enum NodeType : unsigned { BUILD_VERTICAL_VECTOR, /// Pointer to the start of the shader's constant data. 
CONST_DATA_PTR, - INIT_EXEC, - INIT_EXEC_FROM_INPUT, - SENDMSG, - SENDMSGHALT, - INTERP_MOV, - INTERP_P1, - INTERP_P2, INTERP_P1LL_F16, INTERP_P1LV_F16, INTERP_P2_F16, @@ -532,6 +526,8 @@ enum NodeType : unsigned { BUFFER_ATOMIC_AND, BUFFER_ATOMIC_OR, BUFFER_ATOMIC_XOR, + BUFFER_ATOMIC_INC, + BUFFER_ATOMIC_DEC, BUFFER_ATOMIC_CMPSWAP, BUFFER_ATOMIC_FADD, BUFFER_ATOMIC_PK_FADD, diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp index f4df20b8f03e..a83ec23ec054 100644 --- a/lib/Target/AMDGPU/AMDGPUInline.cpp +++ b/lib/Target/AMDGPU/AMDGPUInline.cpp @@ -51,7 +51,7 @@ ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), // Inliner constraint to achieve reasonable compilation time static cl::opt<size_t> -MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(300), +MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum BB number allowed in a function after inlining" " (compile time constraint)")); diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 4a8446955496..cf0ce5659951 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -110,39 +110,38 @@ def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; // Force dependencies for vector trunc stores def R600dummy_chain : SDNode<"AMDGPUISD::DUMMY_CHAIN", SDTNone, [SDNPHasChain]>; -def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; -def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; - +def AMDGPUcos_impl : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; +def AMDGPUsin_impl : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; // out = a - floor(a) -def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; +def AMDGPUfract_impl : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; // out = 1.0 / a -def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; +def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) -def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; +def AMDGPUrsq_impl : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) -def AMDGPUrcp_legacy : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>; -def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; +def AMDGPUrsq_legacy_impl : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; +def AMDGPUrcp_legacy_impl : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>; def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) result clamped to +/- max_float. 
-def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>; +def AMDGPUrsq_clamp_impl : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>; -def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; +def AMDGPUldexp_impl : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; -def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>; -def AMDGPUpknorm_i16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>; -def AMDGPUpknorm_u16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>; -def AMDGPUpk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>; -def AMDGPUpk_u16_u32 : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>; +def AMDGPUpkrtz_f16_f32_impl : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>; +def AMDGPUpknorm_i16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>; +def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>; +def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>; +def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>; def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>; def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>; -def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; +def AMDGPUfp_class_impl : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; // out = max(a, b) a and b are floats, where a nan comparison fails. // This is not commutative because this gives the second operand: @@ -285,7 +284,7 @@ def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>; -def AMDGPUffbh_i32 : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>; +def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>; def AMDGPUffbl_b32 : SDNode<"AMDGPUISD::FFBL_B32", SDTIntUnaryOp>; @@ -320,7 +319,7 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, [] >; -def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; +def AMDGPUfmed3_impl : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2", SDTypeProfile<1, 4, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>, @@ -330,35 +329,6 @@ def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2", def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; -def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC", - SDTypeProfile<0, 1, [SDTCisInt<0>]>, - [SDNPHasChain, SDNPInGlue]>; - -def AMDGPUinit_exec_from_input : SDNode<"AMDGPUISD::INIT_EXEC_FROM_INPUT", - SDTypeProfile<0, 2, - [SDTCisInt<0>, SDTCisInt<1>]>, - [SDNPHasChain, SDNPInGlue]>; - -def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", - SDTypeProfile<0, 1, [SDTCisInt<0>]>, - [SDNPHasChain, SDNPInGlue]>; - -def AMDGPUsendmsghalt : SDNode<"AMDGPUISD::SENDMSGHALT", - SDTypeProfile<0, 1, [SDTCisInt<0>]>, - [SDNPHasChain, SDNPInGlue]>; - -def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV", - SDTypeProfile<1, 3, [SDTCisFP<0>]>, - [SDNPInGlue]>; - -def AMDGPUinterp_p1 : SDNode<"AMDGPUISD::INTERP_P1", - SDTypeProfile<1, 3, [SDTCisFP<0>]>, - [SDNPInGlue, SDNPOutGlue]>; - -def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2", - SDTypeProfile<1, 4, [SDTCisFP<0>]>, - [SDNPInGlue]>; - def AMDGPUinterp_p1ll_f16 : SDNode<"AMDGPUISD::INTERP_P1LL_F16", SDTypeProfile<1, 7, [SDTCisFP<0>]>, [SDNPInGlue, SDNPOutGlue]>; @@ -425,3 +395,65 @@ def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone, def 
AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] >; + + +//===----------------------------------------------------------------------===// +// Intrinsic/Custom node compatibility PatFrags +//===----------------------------------------------------------------------===// + +def AMDGPUrcp : PatFrags<(ops node:$src), [(int_amdgcn_rcp node:$src), + (AMDGPUrcp_impl node:$src)]>; +def AMDGPUrcp_legacy : PatFrags<(ops node:$src), [(int_amdgcn_rcp_legacy node:$src), + (AMDGPUrcp_legacy_impl node:$src)]>; + +def AMDGPUrsq_legacy : PatFrags<(ops node:$src), [(int_amdgcn_rsq_legacy node:$src), + (AMDGPUrsq_legacy_impl node:$src)]>; + +def AMDGPUrsq : PatFrags<(ops node:$src), [(int_amdgcn_rsq node:$src), + (AMDGPUrsq_impl node:$src)]>; + +def AMDGPUrsq_clamp : PatFrags<(ops node:$src), [(int_amdgcn_rsq_clamp node:$src), + (AMDGPUrsq_clamp_impl node:$src)]>; + +def AMDGPUsin : PatFrags<(ops node:$src), [(int_amdgcn_sin node:$src), + (AMDGPUsin_impl node:$src)]>; +def AMDGPUcos : PatFrags<(ops node:$src), [(int_amdgcn_cos node:$src), + (AMDGPUcos_impl node:$src)]>; +def AMDGPUfract : PatFrags<(ops node:$src), [(int_amdgcn_fract node:$src), + (AMDGPUfract_impl node:$src)]>; + +def AMDGPUldexp : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_ldexp node:$src0, node:$src1), + (AMDGPUldexp_impl node:$src0, node:$src1)]>; + +def AMDGPUfp_class : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_class node:$src0, node:$src1), + (AMDGPUfp_class_impl node:$src0, node:$src1)]>; + +def AMDGPUfmed3 : PatFrags<(ops node:$src0, node:$src1, node:$src2), + [(int_amdgcn_fmed3 node:$src0, node:$src1, node:$src2), + (AMDGPUfmed3_impl node:$src0, node:$src1, node:$src2)]>; + +def AMDGPUffbh_i32 : PatFrags<(ops node:$src), + [(int_amdgcn_sffbh node:$src), + (AMDGPUffbh_i32_impl node:$src)]>; + +def AMDGPUpkrtz_f16_f32 : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_cvt_pkrtz node:$src0, node:$src1), + (AMDGPUpkrtz_f16_f32_impl node:$src0, node:$src1)]>; + +def AMDGPUpknorm_i16_f32 : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_cvt_pknorm_i16 node:$src0, node:$src1), + (AMDGPUpknorm_i16_f32_impl node:$src0, node:$src1)]>; + +def AMDGPUpknorm_u16_f32 : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_cvt_pknorm_u16 node:$src0, node:$src1), + (AMDGPUpknorm_u16_f32_impl node:$src0, node:$src1)]>; + +def AMDGPUpk_i16_i32 : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_cvt_pk_i16 node:$src0, node:$src1), + (AMDGPUpk_i16_i32_impl node:$src0, node:$src1)]>; + +def AMDGPUpk_u16_u32 : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_cvt_pk_u16 node:$src0, node:$src1), + (AMDGPUpk_u16_u32_impl node:$src0, node:$src1)]>; diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 901a2eaa8829..3cfa9d57ec46 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -19,8 +19,10 @@ #include "AMDGPUTargetMachine.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -61,8 +63,14 @@
AMDGPUInstructionSelector::AMDGPUInstructionSelector( const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } +void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB, + CodeGenCoverage &CoverageInfo) { + MRI = &MF.getRegInfo(); + InstructionSelector::setupMF(MF, KB, CoverageInfo); +} + static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return Reg == AMDGPU::SCC; auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); @@ -71,7 +79,9 @@ static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) { if (RC) { // FIXME: This is ambiguous for wave32. This could be SCC or VCC, but the // context of the register bank has been lost. - if (RC->getID() != AMDGPU::SReg_32_XM0RegClassID) + // Has a hack: getRegClassForSizeOnBank uses exactly SGPR_32RegClass, which + // won't ever be constrained any further. + if (RC != &AMDGPU::SGPR_32RegClass) return false; const LLT Ty = MRI.getType(Reg); return Ty.isValid() && Ty.getSizeInBits() == 1; @@ -83,7 +93,7 @@ static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) { bool AMDGPUInstructionSelector::isVCC(Register Reg, const MachineRegisterInfo &MRI) const { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return Reg == TRI.getVCC(); auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); @@ -102,8 +112,6 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg, bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { const DebugLoc &DL = I.getDebugLoc(); MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); I.setDesc(TII.get(TargetOpcode::COPY)); const MachineOperand &Src = I.getOperand(1); @@ -111,33 +119,33 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { Register DstReg = Dst.getReg(); Register SrcReg = Src.getReg(); - if (isVCC(DstReg, MRI)) { + if (isVCC(DstReg, *MRI)) { if (SrcReg == AMDGPU::SCC) { const TargetRegisterClass *RC - = TRI.getConstrainedRegClassForOperand(Dst, MRI); + = TRI.getConstrainedRegClassForOperand(Dst, *MRI); if (!RC) return true; - return RBI.constrainGenericRegister(DstReg, *RC, MRI); + return RBI.constrainGenericRegister(DstReg, *RC, *MRI); } - if (!isVCC(SrcReg, MRI)) { + if (!isVCC(SrcReg, *MRI)) { // TODO: Should probably leave the copy and let copyPhysReg expand it. - if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), MRI)) + if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI)) return false; BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg) .addImm(0) .addReg(SrcReg); - if (!MRI.getRegClassOrNull(SrcReg)) - MRI.setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, MRI)); + if (!MRI->getRegClassOrNull(SrcReg)) + MRI->setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, *MRI)); I.eraseFromParent(); return true; } const TargetRegisterClass *RC = - TRI.getConstrainedRegClassForOperand(Dst, MRI); - if (RC && !RBI.constrainGenericRegister(DstReg, *RC, MRI)) + TRI.getConstrainedRegClassForOperand(Dst, *MRI); + if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI)) return false; // Don't constrain the source register to a class so the def instruction @@ -148,8 +156,8 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { // with size 1. An SReg_32 with size 1 is ambiguous with wave32.
if (Src.isUndef()) { const TargetRegisterClass *SrcRC = - TRI.getConstrainedRegClassForOperand(Src, MRI); - if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI)) + TRI.getConstrainedRegClassForOperand(Src, *MRI); + if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) return false; } @@ -157,30 +165,26 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { } for (const MachineOperand &MO : I.operands()) { - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + if (Register::isPhysicalRegister(MO.getReg())) continue; const TargetRegisterClass *RC = - TRI.getConstrainedRegClassForOperand(MO, MRI); + TRI.getConstrainedRegClassForOperand(MO, *MRI); if (!RC) continue; - RBI.constrainGenericRegister(MO.getReg(), *RC, MRI); + RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); } return true; } bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - const Register DefReg = I.getOperand(0).getReg(); - const LLT DefTy = MRI.getType(DefReg); + const LLT DefTy = MRI->getType(DefReg); // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy) const RegClassOrRegBank &RegClassOrBank = - MRI.getRegClassOrRegBank(DefReg); + MRI->getRegClassOrRegBank(DefReg); const TargetRegisterClass *DefRC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); @@ -196,7 +200,7 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { return false; } - DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, MRI); + DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI); if (!DefRC) { LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); return false; @@ -204,7 +208,7 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { } I.setDesc(TII.get(TargetOpcode::PHI)); - return RBI.constrainGenericRegister(DefReg, *DefRC, MRI); + return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI); } MachineOperand @@ -214,13 +218,11 @@ AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, MachineInstr *MI = MO.getParent(); MachineBasicBlock *BB = MO.getParent()->getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - Register DstReg = MRI.createVirtualRegister(&SubRC); + Register DstReg = MRI->createVirtualRegister(&SubRC); if (MO.isReg()) { unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) .addReg(Reg, 0, ComposedSubIdx); @@ -244,10 +246,6 @@ AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, } } -static int64_t getConstant(const MachineInstr *MI) { - return MI->getOperand(1).getCImm()->getSExtValue(); -} - static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { switch (Opc) { case AMDGPU::G_AND: @@ -262,16 +260,13 @@ static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { } bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); MachineOperand &Dst = I.getOperand(0); MachineOperand &Src0 = I.getOperand(1); MachineOperand &Src1 = I.getOperand(2); Register DstReg = Dst.getReg(); - unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); + unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); - const RegisterBank *DstRB = 
RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); if (DstRB->getID() == AMDGPU::VCCRegBankID) { const TargetRegisterClass *RC = TRI.getBoolRC(); unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), @@ -282,12 +277,12 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { // The selector for G_ICMP relies on seeing the register bank for the result // is VCC. In wave32 if we constrain the registers to SReg_32 here, it will // be ambiguous whether it's a scalar or vector bool. - if (Src0.isUndef() && !MRI.getRegClassOrNull(Src0.getReg())) - MRI.setRegClass(Src0.getReg(), RC); - if (Src1.isUndef() && !MRI.getRegClassOrNull(Src1.getReg())) - MRI.setRegClass(Src1.getReg(), RC); + if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg())) + MRI->setRegClass(Src0.getReg(), RC); + if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg())) + MRI->setRegClass(Src1.getReg(), RC); - return RBI.constrainGenericRegister(DstReg, *RC, MRI); + return RBI.constrainGenericRegister(DstReg, *RC, *MRI); } // TODO: Should this allow an SCC bank result, and produce a copy from SCC for @@ -295,14 +290,7 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { if (DstRB->getID() == AMDGPU::SGPRRegBankID) { unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32); I.setDesc(TII.get(InstOpc)); - - const TargetRegisterClass *RC - = TRI.getConstrainedRegClassForOperand(Dst, MRI); - if (!RC) - return false; - return RBI.constrainGenericRegister(DstReg, *RC, MRI) && - RBI.constrainGenericRegister(Src0.getReg(), *RC, MRI) && - RBI.constrainGenericRegister(Src1.getReg(), *RC, MRI); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } return false; @@ -311,11 +299,10 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); Register DstReg = I.getOperand(0).getReg(); const DebugLoc &DL = I.getDebugLoc(); - unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); - const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI); + unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; @@ -340,7 +327,7 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { const unsigned Opc = Sub ? 
AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64; - Register UnusedCarry = MRI.createVirtualRegister(TRI.getWaveMaskRegClass()); + Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass()); MachineInstr *Add = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) .addDef(UnusedCarry, RegState::Dead) @@ -363,8 +350,8 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1)); MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1)); - Register DstLo = MRI.createVirtualRegister(&HalfRC); - Register DstHi = MRI.createVirtualRegister(&HalfRC); + Register DstLo = MRI->createVirtualRegister(&HalfRC); + Register DstHi = MRI->createVirtualRegister(&HalfRC); if (IsSALU) { BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo) @@ -375,14 +362,14 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { .add(Hi2); } else { const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass(); - Register CarryReg = MRI.createVirtualRegister(CarryRC); + Register CarryReg = MRI->createVirtualRegister(CarryRC); BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo) .addDef(CarryReg) .add(Lo1) .add(Lo2) .addImm(0); MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi) - .addDef(MRI.createVirtualRegister(CarryRC), RegState::Dead) + .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead) .add(Hi1) .add(Hi2) .addReg(CarryReg, RegState::Kill) @@ -399,19 +386,61 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { .addImm(AMDGPU::sub1); - if (!RBI.constrainGenericRegister(DstReg, RC, MRI)) + if (!RBI.constrainGenericRegister(DstReg, RC, *MRI)) return false; I.eraseFromParent(); return true; } -bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { +bool AMDGPUInstructionSelector::selectG_UADDO_USUBO(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - assert(I.getOperand(2).getImm() % 32 == 0); - unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(2).getImm() / 32); + const DebugLoc &DL = I.getDebugLoc(); + Register Dst0Reg = I.getOperand(0).getReg(); + Register Dst1Reg = I.getOperand(1).getReg(); + const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO; + + if (!isSCC(Dst1Reg, MRI)) { + // The names of the opcodes are misleading: v_add_i32/v_sub_i32 have an unsigned + // carry out despite the _i32 name. These were renamed in VI to _U32. + // FIXME: We should probably rename the opcodes here. + unsigned NewOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; + I.setDesc(TII.get(NewOpc)); + I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); + I.addOperand(*MF, MachineOperand::CreateImm(0)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + Register Src0Reg = I.getOperand(2).getReg(); + Register Src1Reg = I.getOperand(3).getReg(); + unsigned NewOpc = IsAdd ?
AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; + BuildMI(*BB, &I, DL, TII.get(NewOpc), Dst0Reg) + .add(I.getOperand(2)) + .add(I.getOperand(3)); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) + .addReg(AMDGPU::SCC); + + if (!MRI.getRegClassOrNull(Dst1Reg)) + MRI.setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); + + if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, MRI) || + !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, MRI) || + !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, MRI)) + return false; + + I.eraseFromParent(); + return true; +} + +bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + unsigned Offset = I.getOperand(2).getImm(); + if (Offset % 32 != 0) + return false; + + unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32); const DebugLoc &DL = I.getDebugLoc(); MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), I.getOperand(0).getReg()) @@ -419,10 +448,10 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { for (const MachineOperand &MO : Copy->operands()) { const TargetRegisterClass *RC = - TRI.getConstrainedRegClassForOperand(MO, MRI); + TRI.getConstrainedRegClassForOperand(MO, *MRI); if (!RC) continue; - RBI.constrainGenericRegister(MO.getReg(), *RC, MRI); + RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); } I.eraseFromParent(); return true; @@ -430,21 +459,19 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { MachineBasicBlock *BB = MI.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); Register DstReg = MI.getOperand(0).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + LLT DstTy = MRI->getType(DstReg); + LLT SrcTy = MRI->getType(MI.getOperand(1).getReg()); const unsigned SrcSize = SrcTy.getSizeInBits(); if (SrcSize < 32) return false; const DebugLoc &DL = MI.getDebugLoc(); - const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); const unsigned DstSize = DstTy.getSizeInBits(); const TargetRegisterClass *DstRC = - TRI.getRegClassForSizeOnBank(DstSize, *DstBank, MRI); + TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); if (!DstRC) return false; @@ -457,12 +484,12 @@ bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { MIB.addImm(SubRegs[I]); const TargetRegisterClass *SrcRC - = TRI.getConstrainedRegClassForOperand(Src, MRI); - if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, MRI)) + = TRI.getConstrainedRegClassForOperand(Src, *MRI); + if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) return false; } - if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) + if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) return false; MI.eraseFromParent(); @@ -471,25 +498,23 @@ bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { MachineBasicBlock *BB = MI.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); const int NumDst = MI.getNumOperands() - 1; MachineOperand &Src = MI.getOperand(NumDst); Register SrcReg = Src.getReg(); Register DstReg0 = MI.getOperand(0).getReg(); - LLT DstTy = MRI.getType(DstReg0); - LLT SrcTy = 
MRI.getType(SrcReg); + LLT DstTy = MRI->getType(DstReg0); + LLT SrcTy = MRI->getType(SrcReg); const unsigned DstSize = DstTy.getSizeInBits(); const unsigned SrcSize = SrcTy.getSizeInBits(); const DebugLoc &DL = MI.getDebugLoc(); - const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI); + const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); const TargetRegisterClass *SrcRC = - TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, MRI); - if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI)) + TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); + if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) return false; const unsigned SrcFlags = getUndefRegState(Src.isUndef()); @@ -504,8 +529,8 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { .addReg(SrcReg, SrcFlags, SubRegs[I]); const TargetRegisterClass *DstRC = - TRI.getConstrainedRegClassForOperand(Dst, MRI); - if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, MRI)) + TRI.getConstrainedRegClassForOperand(Dst, *MRI); + if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI)) return false; } @@ -518,16 +543,13 @@ bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const { } bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); const MachineOperand &MO = I.getOperand(0); // FIXME: Interface for getConstrainedRegClassForOperand needs work. The // regbank check here is to know why getConstrainedRegClassForOperand failed. - const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, MRI); - if ((!RC && !MRI.getRegBankOrNull(MO.getReg())) || - (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, MRI))) { + const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI); + if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) || + (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) { I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); return true; } @@ -537,44 +559,62 @@ bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(3).getImm() / 32); - DebugLoc DL = I.getDebugLoc(); - MachineInstr *Ins = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG)) - .addDef(I.getOperand(0).getReg()) - .addReg(I.getOperand(1).getReg()) - .addReg(I.getOperand(2).getReg()) - .addImm(SubReg); - for (const MachineOperand &MO : Ins->operands()) { - if (!MO.isReg()) - continue; - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) - continue; + Register DstReg = I.getOperand(0).getReg(); + Register Src0Reg = I.getOperand(1).getReg(); + Register Src1Reg = I.getOperand(2).getReg(); + LLT Src1Ty = MRI->getType(Src1Reg); + + unsigned DstSize = MRI->getType(DstReg).getSizeInBits(); + unsigned InsSize = Src1Ty.getSizeInBits(); + + int64_t Offset = I.getOperand(3).getImm(); + if (Offset % 32 != 0) + return false; + + unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32); + if (SubReg == AMDGPU::NoSubRegister) + return false; + + const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); + const TargetRegisterClass *DstRC = + TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 
+ if (!DstRC) + return false; + + const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI); + const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI); + const TargetRegisterClass *Src0RC = + TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI); + const TargetRegisterClass *Src1RC = + TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI); + + // Deal with weird cases where the class only partially supports the subreg + // index. + Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg); + if (!Src0RC) + return false; + + if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || + !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) || + !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI)) + return false; + + const DebugLoc &DL = I.getDebugLoc(); + BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg) + .addReg(Src0Reg) + .addReg(Src1Reg) + .addImm(SubReg); - const TargetRegisterClass *RC = - TRI.getConstrainedRegClassForOperand(MO, MRI); - if (!RC) - continue; - RBI.constrainGenericRegister(MO.getReg(), *RC, MRI); - } I.eraseFromParent(); return true; } -bool AMDGPUInstructionSelector::selectG_INTRINSIC( - MachineInstr &I, CodeGenCoverage &CoverageInfo) const { - unsigned IntrinsicID = I.getOperand(I.getNumExplicitDefs()).getIntrinsicID(); +bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { + unsigned IntrinsicID = I.getIntrinsicID(); switch (IntrinsicID) { - case Intrinsic::maxnum: - case Intrinsic::minnum: - case Intrinsic::amdgcn_cvt_pkrtz: - return selectImpl(I, CoverageInfo); case Intrinsic::amdgcn_if_break: { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); // FIXME: Manually selecting to avoid dealing with the SReg_1 trick // SelectionDAG uses for wave32 vs wave64.
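The rewritten selectG_INSERT above reduces a 32-bit-aligned G_INSERT to INSERT_SUBREG by translating the bit offset and inserted width into the dword channel and count passed to TRI.getSubRegFromChannel. A small sketch of that mapping, assuming the dword-based convention used above (insertToSubRegIndex is an illustrative name, not an in-tree helper):

#include <cstdint>

// Map a G_INSERT's (bit offset, inserted bits) to the (channel, dword count)
// pair handed to getSubRegFromChannel; false means the selector bails out.
static bool insertToSubRegIndex(int64_t OffsetBits, unsigned InsSizeBits,
                                unsigned &Channel, unsigned &NumDwords) {
  if (OffsetBits % 32 != 0)
    return false; // unaligned inserts are rejected, as in the code above
  Channel = static_cast<unsigned>(OffsetBits / 32);
  NumDwords = InsSizeBits / 32;
  return true;
}

// E.g. inserting 64 bits at bit offset 64 of a 128-bit value yields
// Channel = 2, NumDwords = 2, i.e. the sub2_sub3 pair of the wider register.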
@@ -589,15 +629,13 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC( I.eraseFromParent(); - for (Register Reg : { DstReg, Src0Reg, Src1Reg }) { - if (!MRI.getRegClassOrNull(Reg)) - MRI.setRegClass(Reg, TRI.getWaveMaskRegClass()); - } + for (Register Reg : { DstReg, Src0Reg, Src1Reg }) + MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); return true; } default: - return selectImpl(I, CoverageInfo); + return selectImpl(I, *CoverageInfo); } } @@ -677,17 +715,15 @@ int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P, bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); const DebugLoc &DL = I.getDebugLoc(); - unsigned SrcReg = I.getOperand(2).getReg(); - unsigned Size = RBI.getSizeInBits(SrcReg, MRI, TRI); + Register SrcReg = I.getOperand(2).getReg(); + unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); - unsigned CCReg = I.getOperand(0).getReg(); - if (isSCC(CCReg, MRI)) { + Register CCReg = I.getOperand(0).getReg(); + if (isSCC(CCReg, *MRI)) { int Opcode = getS_CMPOpcode(Pred, Size); if (Opcode == -1) return false; @@ -698,7 +734,7 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { .addReg(AMDGPU::SCC); bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) && - RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, MRI); + RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI); I.eraseFromParent(); return Ret; } @@ -712,7 +748,7 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { .add(I.getOperand(2)) .add(I.getOperand(3)); RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), - *TRI.getBoolRC(), MRI); + *TRI.getBoolRC(), *MRI); bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); I.eraseFromParent(); return Ret; @@ -736,19 +772,273 @@ buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt, .addImm(Enabled); } +static bool isZero(Register Reg, MachineRegisterInfo &MRI) { + int64_t C; + if (mi_match(Reg, MRI, m_ICst(C)) && C == 0) + return true; + + // FIXME: matcher should ignore copies + return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0; +} + +static unsigned extractGLC(unsigned AuxiliaryData) { + return AuxiliaryData & 1; +} + +static unsigned extractSLC(unsigned AuxiliaryData) { + return (AuxiliaryData >> 1) & 1; +} + +static unsigned extractDLC(unsigned AuxiliaryData) { + return (AuxiliaryData >> 2) & 1; +} + +static unsigned extractSWZ(unsigned AuxiliaryData) { + return (AuxiliaryData >> 3) & 1; +} + +// Returns Base register, constant offset, and offset def point. 
+static std::tuple<Register, unsigned, MachineInstr *> +getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { + MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); + if (!Def) + return std::make_tuple(Reg, 0, nullptr); + + if (Def->getOpcode() == AMDGPU::G_CONSTANT) { + unsigned Offset; + const MachineOperand &Op = Def->getOperand(1); + if (Op.isImm()) + Offset = Op.getImm(); + else + Offset = Op.getCImm()->getZExtValue(); + + return std::make_tuple(Register(), Offset, Def); + } + + int64_t Offset; + if (Def->getOpcode() == AMDGPU::G_ADD) { + // TODO: Handle G_OR used for add case + if (mi_match(Def->getOperand(2).getReg(), MRI, m_ICst(Offset))) + return std::make_tuple(Def->getOperand(1).getReg(), Offset, Def); + + // FIXME: matcher should ignore copies + if (mi_match(Def->getOperand(2).getReg(), MRI, m_Copy(m_ICst(Offset)))) + return std::make_tuple(Def->getOperand(1).getReg(), Offset, Def); + } + + return std::make_tuple(Reg, 0, Def); +} + +static unsigned getBufferStoreOpcode(LLT Ty, + const unsigned MemSize, + const bool Offen) { + const int Size = Ty.getSizeInBits(); + switch (8 * MemSize) { + case 8: + return Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact : + AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact; + case 16: + return Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact : + AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact; + default: + unsigned Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact : + AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact; + if (Size > 32) + Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32); + return Opc; + } +} + +static unsigned getBufferStoreFormatOpcode(LLT Ty, + const unsigned MemSize, + const bool Offen) { + bool IsD16Packed = Ty.getScalarSizeInBits() == 16; + bool IsD16Unpacked = 8 * MemSize < Ty.getSizeInBits(); + int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; + + if (IsD16Packed) { + switch (NumElts) { + case 1: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact; + case 2: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFSET_exact; + case 3: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFSET_exact; + case 4: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFSET_exact; + default: + return -1; + } + } + + if (IsD16Unpacked) { + switch (NumElts) { + case 1: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact; + case 2: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFSET_exact; + case 3: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFSET_exact; + case 4: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFSET_exact; + default: + return -1; + } + } + + switch (NumElts) { + case 1: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_X_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_X_OFFSET_exact; + case 2: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XY_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_XY_OFFSET_exact; + case 3: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFSET_exact; + case 4: + return Offen ? 
AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFSET_exact; + default: + return -1; + } + + llvm_unreachable("unhandled buffer store"); +} + +// TODO: Move this to combiner +// Returns base register, imm offset, total constant offset. +std::tuple<Register, unsigned, unsigned> +AMDGPUInstructionSelector::splitBufferOffsets(MachineIRBuilder &B, + Register OrigOffset) const { + const unsigned MaxImm = 4095; + Register BaseReg; + unsigned TotalConstOffset; + MachineInstr *OffsetDef; + + std::tie(BaseReg, TotalConstOffset, OffsetDef) + = getBaseWithConstantOffset(*MRI, OrigOffset); + + unsigned ImmOffset = TotalConstOffset; + + // If the immediate value is too big for the immoffset field, keep only the + // low 12 bits (the value masked with 4095) in the immoffset field, so that + // the remainder copied/added into the voffset field is a multiple of 4096 + // and stands a better chance of being CSEd with the copy/add for another + // similar load/store. For example, a total offset of 8196 is split into a + // voffset of 8192 and an immoffset of 4. + // However, do not do that rounding down to a multiple of 4096 if the result + // is negative, as it appears to be illegal to have a negative offset + // in the vgpr, even if adding the immediate offset makes it positive. + unsigned Overflow = ImmOffset & ~MaxImm; + ImmOffset -= Overflow; + if ((int32_t)Overflow < 0) { + Overflow += ImmOffset; + ImmOffset = 0; + } + + if (Overflow != 0) { + // In case this is in a waterfall loop, insert offset code at the def point + // of the offset, not inside the loop. + MachineBasicBlock::iterator OldInsPt = B.getInsertPt(); + MachineBasicBlock &OldMBB = B.getMBB(); + B.setInstr(*OffsetDef); + + if (!BaseReg) { + BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + B.buildInstr(AMDGPU::V_MOV_B32_e32) + .addDef(BaseReg) + .addImm(Overflow); + } else { + Register OverflowVal = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + B.buildInstr(AMDGPU::V_MOV_B32_e32) + .addDef(OverflowVal) + .addImm(Overflow); + + Register NewBaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + TII.getAddNoCarry(B.getMBB(), B.getInsertPt(), B.getDebugLoc(), NewBaseReg) + .addReg(BaseReg) + .addReg(OverflowVal, RegState::Kill) + .addImm(0); + BaseReg = NewBaseReg; + } + + B.setInsertPt(OldMBB, OldInsPt); + } + + return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); +} + +bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI, + bool IsFormat) const { + MachineIRBuilder B(MI); + MachineFunction &MF = B.getMF(); + Register VData = MI.getOperand(1).getReg(); + LLT Ty = MRI->getType(VData); + + int Size = Ty.getSizeInBits(); + if (Size % 32 != 0) + return false; + + // FIXME: Verifier should enforce 1 MMO for these intrinsics. + MachineMemOperand *MMO = *MI.memoperands_begin(); + const int MemSize = MMO->getSize(); + + Register RSrc = MI.getOperand(2).getReg(); + Register VOffset = MI.getOperand(3).getReg(); + Register SOffset = MI.getOperand(4).getReg(); + unsigned AuxiliaryData = MI.getOperand(5).getImm(); + unsigned ImmOffset; + unsigned TotalOffset; + + std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); + if (TotalOffset != 0) + MMO = MF.getMachineMemOperand(MMO, TotalOffset, MemSize); + + const bool Offen = !isZero(VOffset, *MRI); + + int Opc = IsFormat ? 
getBufferStoreFormatOpcode(Ty, MemSize, Offen) : + getBufferStoreOpcode(Ty, MemSize, Offen); + if (Opc == -1) + return false; + + MachineInstrBuilder MIB = B.buildInstr(Opc) + .addUse(VData); + + if (Offen) + MIB.addUse(VOffset); + + MIB.addUse(RSrc) + .addUse(SOffset) + .addImm(ImmOffset) + .addImm(extractGLC(AuxiliaryData)) + .addImm(extractSLC(AuxiliaryData)) + .addImm(0) // tfe: FIXME: Remove from inst + .addImm(extractDLC(AuxiliaryData)) + .addImm(extractSWZ(AuxiliaryData)) + .addMemOperand(MMO); + + MI.eraseFromParent(); + + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); +} + bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( - MachineInstr &I, CodeGenCoverage &CoverageInfo) const { + MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - - unsigned IntrinsicID = I.getOperand(0).getIntrinsicID(); + unsigned IntrinsicID = I.getIntrinsicID(); switch (IntrinsicID) { case Intrinsic::amdgcn_exp: { - int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg())); - int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg())); - int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(7).getReg())); - int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(8).getReg())); + int64_t Tgt = I.getOperand(1).getImm(); + int64_t Enabled = I.getOperand(2).getImm(); + int64_t Done = I.getOperand(7).getImm(); + int64_t VM = I.getOperand(8).getImm(); MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(), I.getOperand(4).getReg(), @@ -761,13 +1051,13 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( } case Intrinsic::amdgcn_exp_compr: { const DebugLoc &DL = I.getDebugLoc(); - int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg())); - int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg())); - unsigned Reg0 = I.getOperand(3).getReg(); - unsigned Reg1 = I.getOperand(4).getReg(); - unsigned Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(5).getReg())); - int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(6).getReg())); + int64_t Tgt = I.getOperand(1).getImm(); + int64_t Enabled = I.getOperand(2).getImm(); + Register Reg0 = I.getOperand(3).getReg(); + Register Reg1 = I.getOperand(4).getReg(); + Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + int64_t Done = I.getOperand(5).getImm(); + int64_t VM = I.getOperand(6).getImm(); BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef); MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM, @@ -786,27 +1076,29 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( Register Reg = I.getOperand(1).getReg(); I.eraseFromParent(); - if (!MRI.getRegClassOrNull(Reg)) - MRI.setRegClass(Reg, TRI.getWaveMaskRegClass()); + if (!MRI->getRegClassOrNull(Reg)) + MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); return true; } + case Intrinsic::amdgcn_raw_buffer_store: + return selectStoreIntrinsic(I, false); + case Intrinsic::amdgcn_raw_buffer_store_format: + return selectStoreIntrinsic(I, true); default: - return selectImpl(I, CoverageInfo); + return selectImpl(I, *CoverageInfo); } } bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); const DebugLoc &DL = I.getDebugLoc(); - unsigned DstReg = 
I.getOperand(0).getReg(); - unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); + Register DstReg = I.getOperand(0).getReg(); + unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); assert(Size <= 32 || Size == 64); const MachineOperand &CCOp = I.getOperand(1); - unsigned CCReg = CCOp.getReg(); - if (isSCC(CCReg, MRI)) { + Register CCReg = CCOp.getReg(); + if (isSCC(CCReg, *MRI)) { unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) @@ -815,8 +1107,8 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { // The generic constrainSelectedInstRegOperands doesn't work for the scc register // bank, because it does not cover the register class that we use to represent // it. So we need to set the register class manually here. - if (!MRI.getRegClassOrNull(CCReg)) - MRI.setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, MRI)); + if (!MRI->getRegClassOrNull(CCReg)) + MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI)); MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) .add(I.getOperand(2)) .add(I.getOperand(3)); @@ -845,52 +1137,8 @@ } bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - DebugLoc DL = I.getDebugLoc(); - unsigned PtrSize = RBI.getSizeInBits(I.getOperand(1).getReg(), MRI, TRI); - if (PtrSize != 64) { - LLVM_DEBUG(dbgs() << "Unhandled address space\n"); - return false; - } - - unsigned StoreSize = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI); - unsigned Opcode; - - // FIXME: Remove this when integers > s32 naturally selected. - switch (StoreSize) { - default: - return false; - case 32: - Opcode = AMDGPU::FLAT_STORE_DWORD; - break; - case 64: - Opcode = AMDGPU::FLAT_STORE_DWORDX2; - break; - case 96: - Opcode = AMDGPU::FLAT_STORE_DWORDX3; - break; - case 128: - Opcode = AMDGPU::FLAT_STORE_DWORDX4; - break; - } - - MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode)) - .add(I.getOperand(1)) - .add(I.getOperand(0)) - .addImm(0) // offset - .addImm(0) // glc - .addImm(0) // slc - .addImm(0); // dlc - - - // Now that we selected an opcode, we need to constrain the register - // operands to use appropriate classes.
- bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI); - - I.eraseFromParent(); - return Ret; + initM0(I); + return selectImpl(I, *CoverageInfo); } static int sizeToSubRegIndex(unsigned Size) { @@ -915,19 +1163,15 @@ static int sizeToSubRegIndex(unsigned Size) { } bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - - unsigned DstReg = I.getOperand(0).getReg(); - unsigned SrcReg = I.getOperand(1).getReg(); - const LLT DstTy = MRI.getType(DstReg); - const LLT SrcTy = MRI.getType(SrcReg); + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + const LLT DstTy = MRI->getType(DstReg); + const LLT SrcTy = MRI->getType(SrcReg); if (!DstTy.isScalar()) return false; - const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI); - const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, MRI, TRI); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); + const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); if (SrcRB != DstRB) return false; @@ -935,9 +1179,9 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { unsigned SrcSize = SrcTy.getSizeInBits(); const TargetRegisterClass *SrcRC - = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, MRI); + = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); const TargetRegisterClass *DstRC - = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, MRI); + = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); if (SrcSize > 32) { int SubRegIdx = sizeToSubRegIndex(DstSize); @@ -953,8 +1197,8 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { I.getOperand(1).setSubReg(SubRegIdx); } - if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || - !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { + if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || + !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); return false; } @@ -974,20 +1218,18 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { bool Signed = I.getOpcode() == AMDGPU::G_SEXT; const DebugLoc &DL = I.getDebugLoc(); MachineBasicBlock &MBB = *I.getParent(); - MachineFunction &MF = *MBB.getParent(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); - const LLT DstTy = MRI.getType(DstReg); - const LLT SrcTy = MRI.getType(SrcReg); + const LLT DstTy = MRI->getType(DstReg); + const LLT SrcTy = MRI->getType(SrcReg); const LLT S1 = LLT::scalar(1); const unsigned SrcSize = SrcTy.getSizeInBits(); const unsigned DstSize = DstTy.getSizeInBits(); if (!DstTy.isScalar()) return false; - const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI); + const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); if (SrcBank->getID() == AMDGPU::SCCRegBankID) { if (SrcTy != S1 || DstSize > 64) // Invalid @@ -1000,7 +1242,7 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { // FIXME: Create an extra copy to avoid incorrectly constraining the result // of the scc producer. 
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), TmpReg) .addReg(SrcReg); BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) @@ -1010,7 +1252,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { BuildMI(MBB, I, DL, TII.get(Opcode), DstReg) .addImm(0) .addImm(Signed ? -1 : 1); - return RBI.constrainGenericRegister(DstReg, *DstRC, MRI); + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); } if (SrcBank->getID() == AMDGPU::VCCRegBankID && DstSize <= 32) { @@ -1024,6 +1267,7 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { .addImm(0) // src1_modifiers .addImm(Signed ? -1 : 1) // src1 .addUse(SrcReg); + I.eraseFromParent(); return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); } @@ -1040,6 +1284,7 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) .addImm(Mask) .addReg(SrcReg); + I.eraseFromParent(); return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); } @@ -1049,11 +1294,12 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { .addReg(SrcReg) .addImm(0) // Offset .addImm(SrcSize); // Width + I.eraseFromParent(); return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); } if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { - if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI)) + if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI)) return false; if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { @@ -1061,7 +1307,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) .addReg(SrcReg); - return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI); + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); } const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; @@ -1070,10 +1317,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width. if (DstSize > 32 && SrcSize <= 32) { // We need a 64-bit register source, but the high bits don't matter.
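 // (A sketch of what follows: the 32-bit source is placed in the low half of a REG_SEQUENCE whose high half is an IMPLICIT_DEF.)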
- unsigned ExtReg - = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned UndefReg - = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) .addReg(SrcReg) @@ -1085,7 +1330,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { .addReg(ExtReg) .addImm(SrcSize << 16); - return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI); + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); } unsigned Mask; @@ -1099,16 +1345,58 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { .addImm(SrcSize << 16); } - return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI); + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); } return false; } +static int64_t getFPTrueImmVal(unsigned Size, bool Signed) { + switch (Size) { + case 16: + return Signed ? 0xBC00 : 0x3C00; + case 32: + return Signed ? 0xbf800000 : 0x3f800000; + case 64: + return Signed ? 0xbff0000000000000 : 0x3ff0000000000000; + default: + llvm_unreachable("Invalid FP type size"); + } +} + +bool AMDGPUInstructionSelector::selectG_SITOFP_UITOFP(MachineInstr &I) const { + MachineBasicBlock *MBB = I.getParent(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + Register Src = I.getOperand(1).getReg(); + if (!isSCC(Src, MRI)) + return selectImpl(I, *CoverageInfo); + + bool Signed = I.getOpcode() == AMDGPU::G_SITOFP; + Register DstReg = I.getOperand(0).getReg(); + const LLT DstTy = MRI.getType(DstReg); + const unsigned DstSize = DstTy.getSizeInBits(); + const DebugLoc &DL = I.getDebugLoc(); + + BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) + .addReg(Src); + + unsigned NewOpc = + DstSize > 32 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; + auto MIB = BuildMI(*MBB, I, DL, TII.get(NewOpc), DstReg) + .addImm(0) + .addImm(getFPTrueImmVal(DstSize, Signed)); + + if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI)) + return false; + + I.eraseFromParent(); + return true; +} + bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); MachineOperand &ImmOp = I.getOperand(1); // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 
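 // (e.g. a float constant 1.0 is assumed to be lowered to its raw bit pattern 0x3f800000 — the same encoding getFPTrueImmVal above returns.)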
@@ -1119,15 +1407,15 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); } - unsigned DstReg = I.getOperand(0).getReg(); + Register DstReg = I.getOperand(0).getReg(); unsigned Size; bool IsSgpr; - const RegisterBank *RB = MRI.getRegBankOrNull(I.getOperand(0).getReg()); + const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); if (RB) { IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; - Size = MRI.getType(DstReg).getSizeInBits(); + Size = MRI->getType(DstReg).getSizeInBits(); } else { - const TargetRegisterClass *RC = TRI.getRegClassForReg(MRI, DstReg); + const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); IsSgpr = TRI.isSGPRClass(RC); Size = TRI.getRegSizeInBits(*RC); } @@ -1142,34 +1430,41 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } - DebugLoc DL = I.getDebugLoc(); - const TargetRegisterClass *RC = IsSgpr ? &AMDGPU::SReg_32_XM0RegClass : - &AMDGPU::VGPR_32RegClass; - unsigned LoReg = MRI.createVirtualRegister(RC); - unsigned HiReg = MRI.createVirtualRegister(RC); - const APInt &Imm = APInt(Size, I.getOperand(1).getImm()); + const DebugLoc &DL = I.getDebugLoc(); + + APInt Imm(Size, I.getOperand(1).getImm()); + + MachineInstr *ResInst; + if (IsSgpr && TII.isInlineConstant(Imm)) { + ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) + .addImm(I.getOperand(1).getImm()); + } else { + const TargetRegisterClass *RC = IsSgpr ? + &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; + Register LoReg = MRI->createVirtualRegister(RC); + Register HiReg = MRI->createVirtualRegister(RC); - BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) - .addImm(Imm.trunc(32).getZExtValue()); + BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) + .addImm(Imm.trunc(32).getZExtValue()); - BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) - .addImm(Imm.ashr(32).getZExtValue()); + BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) + .addImm(Imm.ashr(32).getZExtValue()); - const MachineInstr *RS = - BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) - .addReg(LoReg) - .addImm(AMDGPU::sub0) - .addReg(HiReg) - .addImm(AMDGPU::sub1); + ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) + .addReg(LoReg) + .addImm(AMDGPU::sub0) + .addReg(HiReg) + .addImm(AMDGPU::sub1); + } // We can't call constrainSelectedInstRegOperands here, because it doesn't // work for target independent opcodes I.eraseFromParent(); const TargetRegisterClass *DstRC = - TRI.getConstrainedRegClassForOperand(RS->getOperand(0), MRI); + TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); if (!DstRC) return true; - return RBI.constrainGenericRegister(DstReg, *DstRC, MRI); + return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); } static bool isConstant(const MachineInstr &MI) { @@ -1188,13 +1483,13 @@ void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, GEPInfo GEPInfo(*PtrMI); - for (unsigned i = 1, e = 3; i < e; ++i) { + for (unsigned i = 1; i != 3; ++i) { const MachineOperand &GEPOp = PtrMI->getOperand(i); const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg()); assert(OpDef); - if (isConstant(*OpDef)) { - // FIXME: Is it possible to have multiple Imm parts? Maybe if we - // are lacking other optimizations. + if (i == 2 && isConstant(*OpDef)) { + // TODO: Could handle constant base + variable offset, but a combine + // probably should have commuted it. 
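+      // (That is, only a constant second source operand (i == 2) is folded into GEPInfo.Imm; a constant base would need the commute noted above.)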
assert(GEPInfo.Imm == 0); GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue(); continue; @@ -1240,16 +1535,26 @@ bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const { return false; } -bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const { - // TODO: Can/should we insert m0 initialization here for DS instructions and - // call the normal selector? - return false; +void AMDGPUInstructionSelector::initM0(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + + const LLT PtrTy = MRI->getType(I.getOperand(1).getReg()); + unsigned AS = PtrTy.getAddressSpace(); + if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) && + STI.ldsRequiresM0Init()) { + // If DS instructions require M0 initialization, insert it before selecting. + BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addImm(-1); + } +} + +bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const { + initM0(I); + return selectImpl(I, *CoverageInfo); } bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); MachineOperand &CondOp = I.getOperand(0); Register CondReg = CondOp.getReg(); const DebugLoc &DL = I.getDebugLoc(); @@ -1263,11 +1568,12 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { // GlobalISel, we should push that decision into RegBankSelect. Assume for now // RegBankSelect knows what it's doing if the branch condition is scc, even // though it currently does not. - if (isSCC(CondReg, MRI)) { + if (isSCC(CondReg, *MRI)) { CondPhysReg = AMDGPU::SCC; BrOpcode = AMDGPU::S_CBRANCH_SCC1; - ConstrainRC = &AMDGPU::SReg_32_XM0RegClass; - } else if (isVCC(CondReg, MRI)) { + // FIXME: Hack for isSCC tests + ConstrainRC = &AMDGPU::SGPR_32RegClass; + } else if (isVCC(CondReg, *MRI)) { // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? // We sort of know, based on the register bank, that a VCC producer ands // inactive lanes with 0. What if there was a logical operation with vcc @@ -1279,8 +1585,8 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { } else return false; - if (!MRI.getRegClassOrNull(CondReg)) - MRI.setRegClass(CondReg, ConstrainRC); + if (!MRI->getRegClassOrNull(CondReg)) + MRI->setRegClass(CondReg, ConstrainRC); BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) .addReg(CondReg); @@ -1292,27 +1598,83 @@ } bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - Register DstReg = I.getOperand(0).getReg(); - const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); if (IsVGPR) I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); return RBI.constrainGenericRegister( - DstReg, IsVGPR ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); +} + +bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const { + uint64_t Align = I.getOperand(2).getImm(); + const uint64_t Mask = ~((UINT64_C(1) << Align) - 1); + + MachineBasicBlock *BB = I.getParent(); + + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); + const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); + const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; + unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; + unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; + const TargetRegisterClass &RegRC + = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; + + LLT Ty = MRI->getType(DstReg); + + const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, + *MRI); + const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, + *MRI); + if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || + !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) + return false; + + const DebugLoc &DL = I.getDebugLoc(); + Register ImmReg = MRI->createVirtualRegister(&RegRC); + BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg) + .addImm(Mask); + + if (Ty.getSizeInBits() == 32) { + BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) + .addReg(SrcReg) + .addReg(ImmReg); + I.eraseFromParent(); + return true; + } + + Register HiReg = MRI->createVirtualRegister(&RegRC); + Register LoReg = MRI->createVirtualRegister(&RegRC); + Register MaskLo = MRI->createVirtualRegister(&RegRC); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) + .addReg(SrcReg, 0, AMDGPU::sub0); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) + .addReg(SrcReg, 0, AMDGPU::sub1); + + BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo) + .addReg(LoReg) + .addReg(ImmReg); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) + .addReg(MaskLo) + .addImm(AMDGPU::sub0) + .addReg(HiReg) + .addImm(AMDGPU::sub1); + I.eraseFromParent(); + return true; } -bool AMDGPUInstructionSelector::select(MachineInstr &I, - CodeGenCoverage &CoverageInfo) const { +bool AMDGPUInstructionSelector::select(MachineInstr &I) { if (I.isPHI()) return selectPHI(I); - if (!isPreISelGenericOpcode(I.getOpcode())) { + if (!I.isPreISelOpcode()) { if (I.isCopy()) return selectCOPY(I); return true; @@ -1324,16 +1686,18 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I, case TargetOpcode::G_XOR: if (selectG_AND_OR_XOR(I)) return true; - return selectImpl(I, CoverageInfo); + return selectImpl(I, *CoverageInfo); case TargetOpcode::G_ADD: case TargetOpcode::G_SUB: - if (selectG_ADD_SUB(I)) + if (selectImpl(I, *CoverageInfo)) return true; - LLVM_FALLTHROUGH; - default: - return selectImpl(I, CoverageInfo); + return selectG_ADD_SUB(I); + case TargetOpcode::G_UADDO: + case TargetOpcode::G_USUBO: + return selectG_UADDO_USUBO(I); case TargetOpcode::G_INTTOPTR: case TargetOpcode::G_BITCAST: + case TargetOpcode::G_PTRTOINT: return selectCOPY(I); case TargetOpcode::G_CONSTANT: case TargetOpcode::G_FCONSTANT: @@ -1353,32 +1717,40 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I, case TargetOpcode::G_INSERT: return selectG_INSERT(I); case TargetOpcode::G_INTRINSIC: - return selectG_INTRINSIC(I, CoverageInfo); + return selectG_INTRINSIC(I); case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: - return selectG_INTRINSIC_W_SIDE_EFFECTS(I, CoverageInfo); + return selectG_INTRINSIC_W_SIDE_EFFECTS(I); case 
TargetOpcode::G_ICMP: if (selectG_ICMP(I)) return true; - return selectImpl(I, CoverageInfo); + return selectImpl(I, *CoverageInfo); case TargetOpcode::G_LOAD: - return selectImpl(I, CoverageInfo); + case TargetOpcode::G_ATOMIC_CMPXCHG: + case TargetOpcode::G_ATOMICRMW_XCHG: + case TargetOpcode::G_ATOMICRMW_ADD: + case TargetOpcode::G_ATOMICRMW_SUB: + case TargetOpcode::G_ATOMICRMW_AND: + case TargetOpcode::G_ATOMICRMW_OR: + case TargetOpcode::G_ATOMICRMW_XOR: + case TargetOpcode::G_ATOMICRMW_MIN: + case TargetOpcode::G_ATOMICRMW_MAX: + case TargetOpcode::G_ATOMICRMW_UMIN: + case TargetOpcode::G_ATOMICRMW_UMAX: + case TargetOpcode::G_ATOMICRMW_FADD: + return selectG_LOAD_ATOMICRMW(I); case TargetOpcode::G_SELECT: return selectG_SELECT(I); case TargetOpcode::G_STORE: - if (selectImpl(I, CoverageInfo)) - return true; return selectG_STORE(I); case TargetOpcode::G_TRUNC: return selectG_TRUNC(I); case TargetOpcode::G_SEXT: case TargetOpcode::G_ZEXT: case TargetOpcode::G_ANYEXT: - if (selectG_SZA_EXT(I)) { - I.eraseFromParent(); - return true; - } - - return false; + return selectG_SZA_EXT(I); + case TargetOpcode::G_SITOFP: + case TargetOpcode::G_UITOFP: + return selectG_SITOFP_UITOFP(I); case TargetOpcode::G_BRCOND: return selectG_BRCOND(I); case TargetOpcode::G_FRAME_INDEX: @@ -1388,6 +1760,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I, // is checking for G_CONSTANT I.setDesc(TII.get(AMDGPU::ATOMIC_FENCE)); return true; + case TargetOpcode::G_PTR_MASK: + return selectG_PTR_MASK(I); + default: + return selectImpl(I, *CoverageInfo); } return false; } @@ -1402,14 +1778,14 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl( - Register Src, const MachineRegisterInfo &MRI) const { + Register Src) const { unsigned Mods = 0; - MachineInstr *MI = MRI.getVRegDef(Src); + MachineInstr *MI = MRI->getVRegDef(Src); if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { Src = MI->getOperand(1).getReg(); Mods |= SISrcMods::NEG; - MI = MRI.getVRegDef(Src); + MI = MRI->getVRegDef(Src); } if (MI && MI->getOpcode() == AMDGPU::G_FABS) { @@ -1432,12 +1808,23 @@ AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { - MachineRegisterInfo &MRI - = Root.getParent()->getParent()->getParent()->getRegInfo(); + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod + }}; +} +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3Mods0Clamp0OMod(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, @@ -1446,6 +1833,7 @@ AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod }}; } + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { return {{ @@ -1457,12 +1845,9 @@ AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { 
InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { - MachineRegisterInfo &MRI - = Root.getParent()->getParent()->getParent()->getRegInfo(); - Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, @@ -1471,12 +1856,28 @@ } InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { - MachineRegisterInfo &MRI = - Root.getParent()->getParent()->getParent()->getRegInfo(); +AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const { + // FIXME: Handle clamp and op_sel + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src_mods + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // clamp + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { + // FIXME: Handle op_sel + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods + }}; +} +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { SmallVector<GEPInfo, 4> AddrInfo; - getAddrModeInfo(*Root.getParent(), MRI, AddrInfo); + getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) return None; @@ -1496,11 +1897,8 @@ AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { - MachineRegisterInfo &MRI = - Root.getParent()->getParent()->getParent()->getRegInfo(); - SmallVector<GEPInfo, 4> AddrInfo; - getAddrModeInfo(*Root.getParent(), MRI, AddrInfo); + getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) return None; @@ -1521,10 +1919,9 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { MachineInstr *MI = Root.getParent(); MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); SmallVector<GEPInfo, 4> AddrInfo; - getAddrModeInfo(*MI, MRI, AddrInfo); + getAddrModeInfo(*MI, *MRI, AddrInfo); // FIXME: We should shrink the GEP if the offset is known to fit in 32 bits; // then we could select all ptr + 32-bit offsets, not just immediate offsets. @@ -1540,7 +1937,7 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { // failed trying to select this load into one of the _IMM variants since // the _IMM patterns are considered before the _SGPR patterns.
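 // (Hence the constant offset is rematerialized into an SGPR with S_MOV_B32 below so that an _SGPR pattern can match.)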
unsigned PtrReg = GEPInfo.SgprParts[0]; - unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) .addImm(GEPInfo.Imm); return {{ @@ -1553,8 +1950,6 @@ template <bool Signed> InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { MachineInstr *MI = Root.getParent(); - MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); InstructionSelector::ComplexRendererFns Default = {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, @@ -1565,12 +1960,12 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { if (!STI.hasFlatInstOffsets()) return Default; - const MachineInstr *OpDef = MRI.getVRegDef(Root.getReg()); + const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg()); if (!OpDef || OpDef->getOpcode() != AMDGPU::G_GEP) return Default; Optional<int64_t> Offset = - getConstantVRegVal(OpDef->getOperand(2).getReg(), MRI); + getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI); if (!Offset.hasValue()) return Default; @@ -1597,12 +1992,6 @@ AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const { return selectFlatOffsetImpl<true>(Root); } -// FIXME: Implement -static bool signBitIsZero(const MachineOperand &Op, - const MachineRegisterInfo &MRI) { - return false; -} - static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>(); return PSV && PSV->isStack(); @@ -1613,12 +2002,11 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { MachineInstr *MI = Root.getParent(); MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MBB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); int64_t Offset = 0; - if (mi_match(Root.getReg(), MRI, m_ICst(Offset))) { - Register HighBits = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) { + Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); // TODO: Should this be inside the render function? The iterator seems to // move. @@ -1652,18 +2040,18 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { // offsets. 
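 // (Sketch of the match below: (add n0, c0), where c0 must be a legal MUBUF immediate offset and n0 must be provably non-negative when private accesses are range checked.)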
Optional<int> FI; Register VAddr = Root.getReg(); - if (const MachineInstr *RootDef = MRI.getVRegDef(Root.getReg())) { - if (isBaseWithConstantOffset(Root, MRI)) { + if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) { + if (isBaseWithConstantOffset(Root, *MRI)) { const MachineOperand &LHS = RootDef->getOperand(1); const MachineOperand &RHS = RootDef->getOperand(2); - const MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg()); - const MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg()); + const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg()); + const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg()); if (LHSDef && RHSDef) { int64_t PossibleOffset = RHSDef->getOperand(1).getCImm()->getSExtValue(); if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) && (!STI.privateMemoryResourceIsRangeChecked() || - signBitIsZero(LHS, MRI))) { + KnownBits->signBitIsZero(LHS.getReg()))) { if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX) FI = LHSDef->getOperand(1).getIndex(); else @@ -1700,15 +2088,30 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { }}}; } +bool AMDGPUInstructionSelector::isDSOffsetLegal(const MachineRegisterInfo &MRI, + const MachineOperand &Base, + int64_t Offset, + unsigned OffsetBits) const { + if ((OffsetBits == 16 && !isUInt<16>(Offset)) || + (OffsetBits == 8 && !isUInt<8>(Offset))) + return false; + + if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled()) + return true; + + // On Southern Islands, instructions with a negative base value and an offset + // don't seem to work. + return KnownBits->signBitIsZero(Base.getReg()); +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectMUBUFScratchOffset( MachineOperand &Root) const { MachineInstr *MI = Root.getParent(); MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); int64_t Offset = 0; - if (!mi_match(Root.getReg(), MRI, m_ICst(Offset)) || + if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) || !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) return {}; @@ -1728,3 +2131,54 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset( [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset }}; } + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const { + const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); + if (!RootDef) { + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } + }}; + } + + int64_t ConstAddr = 0; + if (isBaseWithConstantOffset(Root, *MRI)) { + const MachineOperand &LHS = RootDef->getOperand(1); + const MachineOperand &RHS = RootDef->getOperand(2); + const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg()); + const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg()); + if (LHSDef && RHSDef) { + int64_t PossibleOffset = + RHSDef->getOperand(1).getCImm()->getSExtValue(); + if (isDSOffsetLegal(*MRI, LHS, PossibleOffset, 16)) { + // (add n0, c0) + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(LHS); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(PossibleOffset); } + }}; + } + } + } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { + // TODO: Handle the (sub base, constant) form. + + } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { + // TODO: Handle a fully constant address. + + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } + }}; +} + +void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, + const MachineInstr &MI) const { + 
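+  // Adds the source G_CONSTANT's value to MIB as a plain immediate operand.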
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); + Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), MRI); + assert(CstVal && "Expected constant value"); + MIB.addImm(CstVal.getValue()); +} diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 4f489ddfb23d..d3c83a6a872a 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -35,6 +35,7 @@ class AMDGPUInstrInfo; class AMDGPURegisterBankInfo; class GCNSubtarget; class MachineInstr; +class MachineIRBuilder; class MachineOperand; class MachineRegisterInfo; class SIInstrInfo; @@ -42,14 +43,20 @@ class SIMachineFunctionInfo; class SIRegisterInfo; class AMDGPUInstructionSelector : public InstructionSelector { +private: + MachineRegisterInfo *MRI; + public: AMDGPUInstructionSelector(const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, const AMDGPUTargetMachine &TM); - bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override; + bool select(MachineInstr &I) override; static const char *getName(); + void setupMF(MachineFunction &MF, GISelKnownBits &KB, + CodeGenCoverage &CoverageInfo) override; + private: struct GEPInfo { const MachineInstr &GEP; @@ -72,32 +79,42 @@ private: bool selectPHI(MachineInstr &I) const; bool selectG_TRUNC(MachineInstr &I) const; bool selectG_SZA_EXT(MachineInstr &I) const; + bool selectG_SITOFP_UITOFP(MachineInstr &I) const; bool selectG_CONSTANT(MachineInstr &I) const; bool selectG_AND_OR_XOR(MachineInstr &I) const; bool selectG_ADD_SUB(MachineInstr &I) const; + bool selectG_UADDO_USUBO(MachineInstr &I) const; bool selectG_EXTRACT(MachineInstr &I) const; bool selectG_MERGE_VALUES(MachineInstr &I) const; bool selectG_UNMERGE_VALUES(MachineInstr &I) const; bool selectG_GEP(MachineInstr &I) const; bool selectG_IMPLICIT_DEF(MachineInstr &I) const; bool selectG_INSERT(MachineInstr &I) const; - bool selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; - bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I, - CodeGenCoverage &CoverageInfo) const; + bool selectG_INTRINSIC(MachineInstr &I) const; + + std::tuple<Register, unsigned, unsigned> + splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const; + + bool selectStoreIntrinsic(MachineInstr &MI, bool IsFormat) const; + + bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I) const; int getS_CMPOpcode(CmpInst::Predicate P, unsigned Size) const; bool selectG_ICMP(MachineInstr &I) const; bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const; void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const; bool selectSMRD(MachineInstr &I, ArrayRef<GEPInfo> AddrInfo) const; - bool selectG_LOAD(MachineInstr &I) const; - bool selectG_SELECT(MachineInstr &I) const; + + void initM0(MachineInstr &I) const; + bool selectG_LOAD_ATOMICRMW(MachineInstr &I) const; bool selectG_STORE(MachineInstr &I) const; + bool selectG_SELECT(MachineInstr &I) const; bool selectG_BRCOND(MachineInstr &I) const; bool selectG_FRAME_INDEX(MachineInstr &I) const; + bool selectG_PTR_MASK(MachineInstr &I) const; std::pair<Register, unsigned> - selectVOP3ModsImpl(Register Src, const MachineRegisterInfo &MRI) const; + selectVOP3ModsImpl(Register Src) const; InstructionSelector::ComplexRendererFns selectVCSRC(MachineOperand &Root) const; @@ -108,11 +125,18 @@ private: 
InstructionSelector::ComplexRendererFns selectVOP3Mods0(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + selectVOP3Mods0Clamp0OMod(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3OMods(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectVOP3Mods(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + selectVOP3OpSelMods0(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3OpSelMods(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns selectSmrdImm(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectSmrdImm32(MachineOperand &Root) const; @@ -133,6 +157,16 @@ private: InstructionSelector::ComplexRendererFns selectMUBUFScratchOffset(MachineOperand &Root) const; + bool isDSOffsetLegal(const MachineRegisterInfo &MRI, + const MachineOperand &Base, + int64_t Offset, unsigned OffsetBits) const; + + InstructionSelector::ComplexRendererFns + selectDS1Addr1Offset(MachineOperand &Root) const; + + void renderTruncImm32(MachineInstrBuilder &MIB, + const MachineInstr &MI) const; + const SIInstrInfo &TII; const SIRegisterInfo &TRI; const AMDGPURegisterBankInfo &RBI; diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index 61bc415c839d..846e7f577a28 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -75,7 +75,7 @@ class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern> let isCodeGenOnly = 1; } -def TruePredicate : Predicate<"true">; +def TruePredicate : Predicate<"">; class PredicateControl { Predicate SubtargetPredicate = TruePredicate; @@ -220,80 +220,48 @@ def hi_f16_elt : PatLeaf< // PatLeafs for floating-point comparisons //===----------------------------------------------------------------------===// -def COND_OEQ : PatLeaf < - (cond), - [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}] ->; - -def COND_ONE : PatLeaf < - (cond), - [{return N->get() == ISD::SETONE || N->get() == ISD::SETNE;}] ->; - -def COND_OGT : PatLeaf < - (cond), - [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}] ->; - -def COND_OGE : PatLeaf < - (cond), - [{return N->get() == ISD::SETOGE || N->get() == ISD::SETGE;}] ->; - -def COND_OLT : PatLeaf < - (cond), - [{return N->get() == ISD::SETOLT || N->get() == ISD::SETLT;}] ->; - -def COND_OLE : PatLeaf < - (cond), - [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}] ->; - -def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>; -def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>; +def COND_OEQ : PatFrags<(ops), [(OtherVT SETOEQ), (OtherVT SETEQ)]>; +def COND_ONE : PatFrags<(ops), [(OtherVT SETONE), (OtherVT SETNE)]>; +def COND_OGT : PatFrags<(ops), [(OtherVT SETOGT), (OtherVT SETGT)]>; +def COND_OGE : PatFrags<(ops), [(OtherVT SETOGE), (OtherVT SETGE)]>; +def COND_OLT : PatFrags<(ops), [(OtherVT SETOLT), (OtherVT SETLT)]>; +def COND_OLE : PatFrags<(ops), [(OtherVT SETOLE), (OtherVT SETLE)]>; +def COND_O : PatFrags<(ops), [(OtherVT SETO)]>; +def COND_UO : PatFrags<(ops), [(OtherVT SETUO)]>; //===----------------------------------------------------------------------===// // PatLeafs for unsigned / unordered comparisons //===----------------------------------------------------------------------===// -def COND_UEQ : PatLeaf <(cond), [{return N->get() == ISD::SETUEQ;}]>; -def COND_UNE : PatLeaf <(cond), [{return N->get() == ISD::SETUNE;}]>; -def COND_UGT : 
PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>; -def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>; -def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>; -def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>; +def COND_UEQ : PatFrag<(ops), (OtherVT SETUEQ)>; +def COND_UNE : PatFrag<(ops), (OtherVT SETUNE)>; +def COND_UGT : PatFrag<(ops), (OtherVT SETUGT)>; +def COND_UGE : PatFrag<(ops), (OtherVT SETUGE)>; +def COND_ULT : PatFrag<(ops), (OtherVT SETULT)>; +def COND_ULE : PatFrag<(ops), (OtherVT SETULE)>; // XXX - For some reason the R600 version prefers to use unordered // for setne? -def COND_UNE_NE : PatLeaf < - (cond), - [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}] ->; +def COND_UNE_NE : PatFrags<(ops), [(OtherVT SETUNE), (OtherVT SETNE)]>; //===----------------------------------------------------------------------===// // PatLeafs for signed comparisons //===----------------------------------------------------------------------===// -def COND_SGT : PatLeaf <(cond), [{return N->get() == ISD::SETGT;}]>; -def COND_SGE : PatLeaf <(cond), [{return N->get() == ISD::SETGE;}]>; -def COND_SLT : PatLeaf <(cond), [{return N->get() == ISD::SETLT;}]>; -def COND_SLE : PatLeaf <(cond), [{return N->get() == ISD::SETLE;}]>; +def COND_SGT : PatFrag<(ops), (OtherVT SETGT)>; +def COND_SGE : PatFrag<(ops), (OtherVT SETGE)>; +def COND_SLT : PatFrag<(ops), (OtherVT SETLT)>; +def COND_SLE : PatFrag<(ops), (OtherVT SETLE)>; //===----------------------------------------------------------------------===// // PatLeafs for integer equality //===----------------------------------------------------------------------===// -def COND_EQ : PatLeaf < - (cond), - [{return N->get() == ISD::SETEQ || N->get() == ISD::SETUEQ;}] ->; - -def COND_NE : PatLeaf < - (cond), - [{return N->get() == ISD::SETNE || N->get() == ISD::SETUNE;}] ->; +def COND_EQ : PatFrags<(ops), [(OtherVT SETEQ), (OtherVT SETUEQ)]>; +def COND_NE : PatFrags<(ops), [(OtherVT SETNE), (OtherVT SETUNE)]>; +// FIXME: Should not need code predicate +//def COND_NULL : PatLeaf<(OtherVT null_frag)>; def COND_NULL : PatLeaf < (cond), [{(void)N; return false;}] @@ -335,17 +303,17 @@ def TEX_SHADOW_ARRAY : PatLeaf< // Load/Store Pattern Fragments //===----------------------------------------------------------------------===// +def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] +>; + class AddressSpaceList<list<int> AS> { list<int> AddrSpaces = AS; } -class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{ - return cast<MemSDNode>(N)->getAlignment() % 8 == 0; -}]>; - -class Aligned16Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{ - return cast<MemSDNode>(N)->getAlignment() >= 16; -}]>; +class Aligned<int Bytes> { + int MinAlignment = Bytes; +} class LoadFrag <SDPatternOperator op> : PatFrag<(ops node:$ptr), (op node:$ptr)>; @@ -502,6 +470,35 @@ defm atomic_store_#as : binary_atomic_op<atomic_store>; } // End foreach AddrSpace +multiclass ret_noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> { + foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { + let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in { + defm "_"#as : binary_atomic_op<atomic_op, IsInt>; + + let PredicateCode = [{return (SDValue(N, 0).use_empty());}] in { + defm "_"#as#"_noret" : binary_atomic_op<atomic_op, IsInt>; + } + + let PredicateCode = [{return !(SDValue(N, 
0).use_empty());}] in { + defm "_"#as#"_ret" : binary_atomic_op<atomic_op, IsInt>; + } + } + } +} + +defm atomic_swap : ret_noret_binary_atomic_op<atomic_swap>; +defm atomic_load_add : ret_noret_binary_atomic_op<atomic_load_add>; +defm atomic_load_and : ret_noret_binary_atomic_op<atomic_load_and>; +defm atomic_load_max : ret_noret_binary_atomic_op<atomic_load_max>; +defm atomic_load_min : ret_noret_binary_atomic_op<atomic_load_min>; +defm atomic_load_or : ret_noret_binary_atomic_op<atomic_load_or>; +defm atomic_load_sub : ret_noret_binary_atomic_op<atomic_load_sub>; +defm atomic_load_umax : ret_noret_binary_atomic_op<atomic_load_umax>; +defm atomic_load_umin : ret_noret_binary_atomic_op<atomic_load_umin>; +defm atomic_load_xor : ret_noret_binary_atomic_op<atomic_load_xor>; +defm atomic_load_fadd : ret_noret_binary_atomic_op<atomic_load_fadd, 0>; + + def store_hi16_private : StoreHi16 <truncstorei16>, PrivateAddress; def truncstorei8_hi16_private : StoreHi16<truncstorei8>, PrivateAddress; @@ -513,21 +510,31 @@ def store_local_hi16 : StoreHi16 <truncstorei16>, LocalAddress; def truncstorei8_local_hi16 : StoreHi16<truncstorei8>, LocalAddress; def atomic_store_local : LocalStore <atomic_store>; -def load_align8_local : Aligned8Bytes < - (ops node:$ptr), (load_local node:$ptr) ->; -def load_align16_local : Aligned16Bytes < - (ops node:$ptr), (load_local node:$ptr) ->; +def load_align8_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> { + let IsLoad = 1; + let IsNonExtLoad = 1; + let MinAlignment = 8; +} -def store_align8_local : Aligned8Bytes < - (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr) ->; +def load_align16_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> { + let IsLoad = 1; + let IsNonExtLoad = 1; + let MinAlignment = 16; +} + +def store_align8_local: PatFrag<(ops node:$val, node:$ptr), + (store_local node:$val, node:$ptr)>, Aligned<8> { + let IsStore = 1; + let IsTruncStore = 0; +} + +def store_align16_local: PatFrag<(ops node:$val, node:$ptr), + (store_local node:$val, node:$ptr)>, Aligned<16> { + let IsStore = 1; + let IsTruncStore = 0; +} -def store_align16_local : Aligned16Bytes < - (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr) ->; def atomic_store_flat : FlatStore <atomic_store>; def truncstorei8_hi16_flat : StoreHi16<truncstorei8>, FlatStoreAddress; @@ -547,69 +554,26 @@ class region_binary_atomic_op<SDNode atomic_op> : }]>; -def atomic_swap_local : local_binary_atomic_op<atomic_swap>; -def atomic_load_add_local : local_binary_atomic_op<atomic_load_add>; -def atomic_load_sub_local : local_binary_atomic_op<atomic_load_sub>; -def atomic_load_and_local : local_binary_atomic_op<atomic_load_and>; -def atomic_load_or_local : local_binary_atomic_op<atomic_load_or>; -def atomic_load_xor_local : local_binary_atomic_op<atomic_load_xor>; -def atomic_load_nand_local : local_binary_atomic_op<atomic_load_nand>; -def atomic_load_min_local : local_binary_atomic_op<atomic_load_min>; -def atomic_load_max_local : local_binary_atomic_op<atomic_load_max>; -def atomic_load_umin_local : local_binary_atomic_op<atomic_load_umin>; -def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>; - def mskor_global : PatFrag<(ops node:$val, node:$ptr), (AMDGPUstore_mskor node:$val, node:$ptr), [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }]>; -class AtomicCmpSwapLocal <SDNode cmp_swap_node> : PatFrag< - (ops node:$ptr, node:$cmp, node:$swap), - (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ - AtomicSDNode *AN = 
cast<AtomicSDNode>(N); - return AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; -}]>; - -class AtomicCmpSwapRegion <SDNode cmp_swap_node> : PatFrag< - (ops node:$ptr, node:$cmp, node:$swap), - (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ - AtomicSDNode *AN = cast<AtomicSDNode>(N); - return AN->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; -}]>; +let AddressSpaces = StoreAddress_local.AddrSpaces in { +defm atomic_cmp_swap_local : ternary_atomic_op<atomic_cmp_swap>; +defm atomic_cmp_swap_local_m0 : ternary_atomic_op<atomic_cmp_swap_glue>; +} -def atomic_cmp_swap_local : AtomicCmpSwapLocal <atomic_cmp_swap>; +let AddressSpaces = StoreAddress_region.AddrSpaces in { +defm atomic_cmp_swap_region : ternary_atomic_op<atomic_cmp_swap>; +defm atomic_cmp_swap_region_m0 : ternary_atomic_op<atomic_cmp_swap_glue>; +} class global_binary_atomic_op_frag<SDNode atomic_op> : PatFrag< (ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>; -multiclass global_binary_atomic_op<SDNode atomic_op> { - def "" : global_binary_atomic_op_frag<atomic_op>; - - def _noret : PatFrag< - (ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; - - def _ret : PatFrag< - (ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; -} - -defm atomic_swap_global : global_binary_atomic_op<atomic_swap>; -defm atomic_add_global : global_binary_atomic_op<atomic_load_add>; -defm atomic_and_global : global_binary_atomic_op<atomic_load_and>; -defm atomic_max_global : global_binary_atomic_op<atomic_load_max>; -defm atomic_min_global : global_binary_atomic_op<atomic_load_min>; -defm atomic_or_global : global_binary_atomic_op<atomic_load_or>; -defm atomic_sub_global : global_binary_atomic_op<atomic_load_sub>; -defm atomic_umax_global : global_binary_atomic_op<atomic_load_umax>; -defm atomic_umin_global : global_binary_atomic_op<atomic_load_umin>; -defm atomic_xor_global : global_binary_atomic_op<atomic_load_xor>; - // Legacy. def AMDGPUatomic_cmp_swap_global : PatFrag< (ops node:$ptr, node:$value), diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 670f6225fbf7..5aba35a19ced 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -11,6 +11,13 @@ /// \todo This should be generated by TableGen. 
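The _noret and _ret atomic variants defined above differ only in a PredicateCode that tests whether the atomic's data result is consumed. A minimal editorial sketch of that test in C++ (not part of the patch; the real check runs inside the TableGen-generated matcher):

#include "llvm/CodeGen/SelectionDAGNodes.h"

// The "_noret" forms match only when result 0 of the atomic node is unused,
// so instruction selection can pick the cheaper no-return machine atomic.
static bool isNoRetAtomic(llvm::SDNode *N) {
  return llvm::SDValue(N, 0).use_empty();
}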
//===----------------------------------------------------------------------===// +#if defined(_MSC_VER) || defined(__MINGW32__) +// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI +// from the Visual C++ cmath / math.h headers: +// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019 +#define _USE_MATH_DEFINES +#endif + #include "AMDGPU.h" #include "AMDGPULegalizerInfo.h" #include "AMDGPUTargetMachine.h" @@ -20,6 +27,7 @@ #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Type.h" #include "llvm/Support/Debug.h" @@ -32,7 +40,7 @@ using namespace LegalityPredicates; static LegalityPredicate isMultiple32(unsigned TypeIdx, - unsigned MaxSize = 512) { + unsigned MaxSize = 1024) { return [=](const LegalityQuery &Query) { const LLT Ty = Query.Types[TypeIdx]; const LLT EltTy = Ty.getScalarType(); @@ -40,12 +48,27 @@ static LegalityPredicate isMultiple32(unsigned TypeIdx, }; } +static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) { + return [=](const LegalityQuery &Query) { + return Query.Types[TypeIdx].getSizeInBits() == Size; + }; +} + static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { const LLT Ty = Query.Types[TypeIdx]; return Ty.isVector() && Ty.getNumElements() % 2 != 0 && - Ty.getElementType().getSizeInBits() < 32; + Ty.getElementType().getSizeInBits() < 32 && + Ty.getSizeInBits() % 32 != 0; + }; +} + +static LegalityPredicate isWideVec16(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + const LLT EltTy = Ty.getScalarType(); + return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; }; } @@ -68,6 +91,31 @@ static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { }; } +// Increase the number of vector elements to reach the next multiple of 32-bit +// type. +static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + + const LLT EltTy = Ty.getElementType(); + const int Size = Ty.getSizeInBits(); + const int EltSize = EltTy.getSizeInBits(); + const int NextMul32 = (Size + 31) / 32; + + assert(EltSize < 32); + + const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; + return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); + }; +} + +static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { + return [=](const LegalityQuery &Query) { + const LLT QueryTy = Query.Types[TypeIdx]; + return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; + }; +} + static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { return [=](const LegalityQuery &Query) { const LLT QueryTy = Query.Types[TypeIdx]; @@ -82,7 +130,7 @@ static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { }; } -// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of +// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of // v2s16. 
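To make the rounding in moreEltsToNext32Bit above concrete: a v3s8 vector is 24 bits, so NextMul32 = (24 + 31) / 32 = 1 chunk, and NewNumElts = (32 * 1 + 8 - 1) / 8 = 4, i.e. the type widens to v4s8 (exactly 32 bits). An editorial sketch of the same arithmetic on plain integers, with a hypothetical helper name:

// Round a vector's element count up so the total size is a whole number of
// 32-bit chunks, mirroring moreEltsToNext32Bit.
static int roundEltsToNext32Bit(int NumElts, int EltSize) {
  const int Size = NumElts * EltSize;      // total bits, e.g. 3 * 8 = 24
  const int NextMul32 = (Size + 31) / 32;  // 32-bit chunks needed: 1
  return (32 * NextMul32 + EltSize - 1) / EltSize;  // -> 4 elements
}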
static LegalityPredicate isRegisterType(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { @@ -94,7 +142,21 @@ static LegalityPredicate isRegisterType(unsigned TypeIdx) { EltSize == 128 || EltSize == 256; } - return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512; + return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; + }; +} + +static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { + return [=](const LegalityQuery &Query) { + return Query.Types[TypeIdx].getElementType() == Type; + }; +} + +static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + return !Ty.isVector() && Ty.getSizeInBits() > 32 && + Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); }; } @@ -112,9 +174,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT S16 = LLT::scalar(16); const LLT S32 = LLT::scalar(32); const LLT S64 = LLT::scalar(64); + const LLT S96 = LLT::scalar(96); const LLT S128 = LLT::scalar(128); const LLT S256 = LLT::scalar(256); - const LLT S512 = LLT::scalar(512); + const LLT S1024 = LLT::scalar(1024); const LLT V2S16 = LLT::vector(2, 16); const LLT V4S16 = LLT::vector(4, 16); @@ -134,6 +197,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT V14S32 = LLT::vector(14, 32); const LLT V15S32 = LLT::vector(15, 32); const LLT V16S32 = LLT::vector(16, 32); + const LLT V32S32 = LLT::vector(32, 32); const LLT V2S64 = LLT::vector(2, 64); const LLT V3S64 = LLT::vector(3, 64); @@ -142,16 +206,19 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT V6S64 = LLT::vector(6, 64); const LLT V7S64 = LLT::vector(7, 64); const LLT V8S64 = LLT::vector(8, 64); + const LLT V16S64 = LLT::vector(16, 64); std::initializer_list<LLT> AllS32Vectors = {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, - V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32}; + V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; std::initializer_list<LLT> AllS64Vectors = - {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64}; + {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); + const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); + const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); @@ -162,7 +229,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, }; const std::initializer_list<LLT> AddrSpaces32 = { - LocalPtr, PrivatePtr + LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr }; const std::initializer_list<LLT> FPTypesBase = { @@ -216,37 +283,34 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) .clampScalar(0, S32, S64) .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) - .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0)) + .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) .widenScalarToNextPow2(0) .scalarize(0); - getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO, + getActionDefinitionsBuilder({G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) .legalFor({{S32, S1}}) - .clampScalar(0, S32, S32); + 
.clampScalar(0, S32, S32) + .scalarize(0); // TODO: Implement. + + getActionDefinitionsBuilder({G_SADDO, G_SSUBO}) + .lower(); getActionDefinitionsBuilder(G_BITCAST) - .legalForCartesianProduct({S32, V2S16}) - .legalForCartesianProduct({S64, V2S32, V4S16}) - .legalForCartesianProduct({V2S64, V4S32}) // Don't worry about the size constraint. - .legalIf(all(isPointer(0), isPointer(1))); + .legalIf(all(isRegisterType(0), isRegisterType(1))) + // FIXME: Testing hack + .legalForCartesianProduct({S16, LLT::vector(2, 8), }); - if (ST.has16BitInsts()) { - getActionDefinitionsBuilder(G_FCONSTANT) - .legalFor({S32, S64, S16}) - .clampScalar(0, S16, S64); - } else { - getActionDefinitionsBuilder(G_FCONSTANT) - .legalFor({S32, S64}) - .clampScalar(0, S32, S64); - } + getActionDefinitionsBuilder(G_FCONSTANT) + .legalFor({S32, S64, S16}) + .clampScalar(0, S16, S64); getActionDefinitionsBuilder(G_IMPLICIT_DEF) - .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr, + .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) - .clampScalarOrElt(0, S32, S512) + .clampScalarOrElt(0, S32, S1024) .legalIf(isMultiple32(0)) .widenScalarToNextPow2(0, 32) .clampMaxNumElements(0, S32, 16); @@ -256,23 +320,33 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // values may not be legal. We need to figure out how to distinguish // between these two scenarios. getActionDefinitionsBuilder(G_CONSTANT) - .legalFor({S1, S32, S64, GlobalPtr, + .legalFor({S1, S32, S64, S16, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) .clampScalar(0, S32, S64) .widenScalarToNextPow2(0) .legalIf(isPointer(0)); setAction({G_FRAME_INDEX, PrivatePtr}, Legal); + getActionDefinitionsBuilder(G_GLOBAL_VALUE) + .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); + auto &FPOpActions = getActionDefinitionsBuilder( - { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE}) + { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) .legalFor({S32, S64}); + auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) + .customFor({S32, S64}); + auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) + .customFor({S32, S64}); if (ST.has16BitInsts()) { if (ST.hasVOP3PInsts()) FPOpActions.legalFor({S16, V2S16}); else FPOpActions.legalFor({S16}); + + TrigActions.customFor({S16}); + FDIVActions.customFor({S16}); } auto &MinNumMaxNum = getActionDefinitionsBuilder({ @@ -293,22 +367,37 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); } - // TODO: Implement - getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); - if (ST.hasVOP3PInsts()) FPOpActions.clampMaxNumElements(0, S16, 2); + FPOpActions .scalarize(0) .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); + TrigActions + .scalarize(0) + .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); + + FDIVActions + .scalarize(0) + .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); + + getActionDefinitionsBuilder({G_FNEG, G_FABS}) + .legalFor(FPTypesPK16) + .clampMaxNumElements(0, S16, 2) + .scalarize(0) + .clampScalar(0, S16, S64); + + // TODO: Implement + getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); + if (ST.has16BitInsts()) { - getActionDefinitionsBuilder(G_FSQRT) + getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) .legalFor({S32, S64, S16}) .scalarize(0) .clampScalar(0, S16, S64); } else { - getActionDefinitionsBuilder(G_FSQRT) + getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) .legalFor({S32, S64}) .scalarize(0) .clampScalar(0, S32, S64); @@ -334,23 +423,43 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .clampScalar(0, S32, S64); + // Whether this is legal depends on the floating point mode for the function. + auto &FMad = getActionDefinitionsBuilder(G_FMAD); + if (ST.hasMadF16()) + FMad.customFor({S32, S16}); + else + FMad.customFor({S32}); + FMad.scalarize(0) + .lower(); + getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, {S32, S1}, {S64, S1}, {S16, S1}, + {S96, S32}, // FIXME: Hack {S64, LLT::scalar(33)}, {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}}) .scalarize(0); - getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) - .legalFor({{S32, S32}, {S64, S32}}) + // TODO: Split s1->s64 during regbankselect for VALU. + auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) + .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}}) .lowerFor({{S32, S64}}) - .customFor({{S64, S64}}) - .scalarize(0); + .customFor({{S64, S64}}); + if (ST.has16BitInsts()) + IToFP.legalFor({{S16, S16}}); + IToFP.clampScalar(1, S32, S64) + .scalarize(0); - getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) - .legalFor({{S32, S32}, {S32, S64}}) - .scalarize(0); + auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) + .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}); + if (ST.has16BitInsts()) + FPToI.legalFor({{S16, S16}}); + else + FPToI.minScalar(1, S32); + + FPToI.minScalar(0, S32) + .scalarize(0); getActionDefinitionsBuilder(G_INTRINSIC_ROUND) .legalFor({S32, S64}) @@ -374,6 +483,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalForCartesianProduct(AddrSpaces32, {S32}) .scalarize(0); + getActionDefinitionsBuilder(G_PTR_MASK) + .scalarize(0) + .alwaysLegal(); + setAction({G_BLOCK_ADDR, CodePtr}, Legal); auto &CmpBuilder = @@ -415,7 +528,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .widenScalarToNextPow2(1, 32); // TODO: Expand for > s32 - getActionDefinitionsBuilder(G_BSWAP) + getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) .legalFor({S32}) .clampScalar(0, S32, S32) .scalarize(0); @@ -491,87 +604,239 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); }); - if (ST.hasFlatAddressSpace()) { - getActionDefinitionsBuilder(G_ADDRSPACE_CAST) - .scalarize(0) - .custom(); - } + getActionDefinitionsBuilder(G_ADDRSPACE_CAST) + .scalarize(0) + .custom(); // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we // handle some operations by just promoting the register during // selection. There are also d16 loads on GFX9+ which preserve the high bits. 
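Before the memory rules that follow: G_FMAD above is marked custom rather than legal because fused multiply-add on these targets is only usable when denormals are flushed, and legalizeFMad later in this patch encodes exactly that. A hedged sketch of the decision, with plain booleans standing in for the subtarget and mode queries (names hypothetical):

// fmad is kept only when the type's denormal support is off; otherwise it is
// lowered to a separate multiply and add.
static bool fmadIsLegal(bool IsF16, bool FP32Denormals, bool FP16Denormals) {
  return IsF16 ? !FP16Denormals : !FP32Denormals;
}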
- getActionDefinitionsBuilder({G_LOAD, G_STORE}) - .narrowScalarIf([](const LegalityQuery &Query) { - unsigned Size = Query.Types[0].getSizeInBits(); - unsigned MemSize = Query.MMODescrs[0].SizeInBits; - return (Size > 32 && MemSize < Size); - }, - [](const LegalityQuery &Query) { - return std::make_pair(0, LLT::scalar(32)); - }) - .fewerElementsIf([=](const LegalityQuery &Query) { - unsigned MemSize = Query.MMODescrs[0].SizeInBits; - return (MemSize == 96) && - Query.Types[0].isVector() && - !ST.hasDwordx3LoadStores(); - }, - [=](const LegalityQuery &Query) { - return std::make_pair(0, V2S32); - }) - .legalIf([=](const LegalityQuery &Query) { - const LLT &Ty0 = Query.Types[0]; + auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned { + switch (AS) { + // FIXME: Private element size. + case AMDGPUAS::PRIVATE_ADDRESS: + return 32; + // FIXME: Check subtarget + case AMDGPUAS::LOCAL_ADDRESS: + return ST.useDS128() ? 128 : 64; - unsigned Size = Ty0.getSizeInBits(); - unsigned MemSize = Query.MMODescrs[0].SizeInBits; - if (Size < 32 || (Size > 32 && MemSize < Size)) - return false; + // Treat constant and global as identical. SMRD loads are sometimes usable + // for global loads (ideally constant address space should be eliminated) + // depending on the context. Legality cannot be context dependent, but + // RegBankSelect can split the load as necessary depending on the pointer + // register bank/uniformity and if the memory is invariant or not written in + // a kernel. + case AMDGPUAS::CONSTANT_ADDRESS: + case AMDGPUAS::GLOBAL_ADDRESS: + return 512; + default: + return 128; + } + }; - if (Ty0.isVector() && Size != MemSize) - return false; + const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool { + const LLT DstTy = Query.Types[0]; - // TODO: Decompose private loads into 4-byte components. - // TODO: Illegal flat loads on SI - switch (MemSize) { - case 8: - case 16: - return Size == 32; - case 32: - case 64: - case 128: - return true; + // Split vector extloads. + unsigned MemSize = Query.MMODescrs[0].SizeInBits; + if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) + return true; - case 96: - return ST.hasDwordx3LoadStores(); + const LLT PtrTy = Query.Types[1]; + unsigned AS = PtrTy.getAddressSpace(); + if (MemSize > maxSizeForAddrSpace(AS)) + return true; - case 256: - case 512: - // TODO: Possibly support loads of i256 and i512 . This will require - // adding i256 and i512 types to MVT in order for to be able to use - // TableGen. - // TODO: Add support for other vector types, this will require - // defining more value mappings for the new types. - return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 || - Ty0.getScalarType().getSizeInBits() == 64); + // Catch weird sized loads that don't evenly divide into the access sizes + // TODO: May be able to widen depending on alignment etc. + unsigned NumRegs = MemSize / 32; + if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) + return true; - default: - return false; - } - }) - .clampScalar(0, S32, S64); + unsigned Align = Query.MMODescrs[0].AlignInBits; + if (Align < MemSize) { + const SITargetLowering *TLI = ST.getTargetLowering(); + return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); + } + + return false; + }; + + unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; + unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; + unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 
0 : 8; + + // TODO: Refine based on subtargets which support unaligned access or 128-bit + // LDS + // TODO: Unsupported flat for SI. + + for (unsigned Op : {G_LOAD, G_STORE}) { + const bool IsStore = Op == G_STORE; + + auto &Actions = getActionDefinitionsBuilder(Op); + // Whitelist the common cases. + // TODO: Pointer loads + // TODO: Wide constant loads + // TODO: Only CI+ has 3x loads + // TODO: Loads to s16 on gfx9 + Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, + {V2S32, GlobalPtr, 64, GlobalAlign32}, + {V3S32, GlobalPtr, 96, GlobalAlign32}, + {S96, GlobalPtr, 96, GlobalAlign32}, + {V4S32, GlobalPtr, 128, GlobalAlign32}, + {S128, GlobalPtr, 128, GlobalAlign32}, + {S64, GlobalPtr, 64, GlobalAlign32}, + {V2S64, GlobalPtr, 128, GlobalAlign32}, + {V2S16, GlobalPtr, 32, GlobalAlign32}, + {S32, GlobalPtr, 8, GlobalAlign8}, + {S32, GlobalPtr, 16, GlobalAlign16}, + + {S32, LocalPtr, 32, 32}, + {S64, LocalPtr, 64, 32}, + {V2S32, LocalPtr, 64, 32}, + {S32, LocalPtr, 8, 8}, + {S32, LocalPtr, 16, 16}, + {V2S16, LocalPtr, 32, 32}, + + {S32, PrivatePtr, 32, 32}, + {S32, PrivatePtr, 8, 8}, + {S32, PrivatePtr, 16, 16}, + {V2S16, PrivatePtr, 32, 32}, + + {S32, FlatPtr, 32, GlobalAlign32}, + {S32, FlatPtr, 16, GlobalAlign16}, + {S32, FlatPtr, 8, GlobalAlign8}, + {V2S16, FlatPtr, 32, GlobalAlign32}, + + {S32, ConstantPtr, 32, GlobalAlign32}, + {V2S32, ConstantPtr, 64, GlobalAlign32}, + {V3S32, ConstantPtr, 96, GlobalAlign32}, + {V4S32, ConstantPtr, 128, GlobalAlign32}, + {S64, ConstantPtr, 64, GlobalAlign32}, + {S128, ConstantPtr, 128, GlobalAlign32}, + {V2S32, ConstantPtr, 32, GlobalAlign32}}); + Actions + .customIf(typeIs(1, Constant32Ptr)) + .narrowScalarIf( + [=](const LegalityQuery &Query) -> bool { + return !Query.Types[0].isVector() && needToSplitLoad(Query); + }, + [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { + const LLT DstTy = Query.Types[0]; + const LLT PtrTy = Query.Types[1]; + + const unsigned DstSize = DstTy.getSizeInBits(); + unsigned MemSize = Query.MMODescrs[0].SizeInBits; + + // Split extloads. + if (DstSize > MemSize) + return std::make_pair(0, LLT::scalar(MemSize)); + + if (DstSize > 32 && (DstSize % 32 != 0)) { + // FIXME: Need a way to specify non-extload of larger size if + // suitably aligned. + return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); + } + + unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); + if (MemSize > MaxSize) + return std::make_pair(0, LLT::scalar(MaxSize)); + + unsigned Align = Query.MMODescrs[0].AlignInBits; + return std::make_pair(0, LLT::scalar(Align)); + }) + .fewerElementsIf( + [=](const LegalityQuery &Query) -> bool { + return Query.Types[0].isVector() && needToSplitLoad(Query); + }, + [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { + const LLT DstTy = Query.Types[0]; + const LLT PtrTy = Query.Types[1]; + + LLT EltTy = DstTy.getElementType(); + unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); + + // Split if it's too large for the address space. + if (Query.MMODescrs[0].SizeInBits > MaxSize) { + unsigned NumElts = DstTy.getNumElements(); + unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; + // FIXME: Refine when odd breakdowns handled + // The scalars will need to be re-legalized. + if (NumPieces == 1 || NumPieces >= NumElts || + NumElts % NumPieces != 0) + return std::make_pair(0, EltTy); + + return std::make_pair(0, + LLT::vector(NumElts / NumPieces, EltTy)); + } + + // Need to split because of alignment. 
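// Editorial example, not part of the patch: a <4 x s32> load carrying only
// 16-bit alignment reaches the code below with EltSize = 32 > Align = 16 and
// 32 / 16 = 2 < 4 elements, so it is broken into <2 x s32> pieces, each of
// which is then re-legalized against the same alignment rules.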
+ unsigned Align = Query.MMODescrs[0].AlignInBits; + unsigned EltSize = EltTy.getSizeInBits(); + if (EltSize > Align && + (EltSize / Align < DstTy.getNumElements())) { + return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); + } + + // May need relegalization for the scalars. + return std::make_pair(0, EltTy); + }) + .minScalar(0, S32); + + if (IsStore) + Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); + + // TODO: Need a bitcast lower option? + Actions + .legalIf([=](const LegalityQuery &Query) { + const LLT Ty0 = Query.Types[0]; + unsigned Size = Ty0.getSizeInBits(); + unsigned MemSize = Query.MMODescrs[0].SizeInBits; + unsigned Align = Query.MMODescrs[0].AlignInBits; + + // No extending vector loads. + if (Size > MemSize && Ty0.isVector()) + return false; + + // FIXME: Widening store from alignment not valid. + if (MemSize < Size) + MemSize = std::max(MemSize, Align); + + switch (MemSize) { + case 8: + case 16: + return Size == 32; + case 32: + case 64: + case 128: + return true; + case 96: + return ST.hasDwordx3LoadStores(); + case 256: + case 512: + return true; + default: + return false; + } + }) + .widenScalarToNextPow2(0) + // TODO: v3s32->v4s32 with alignment + .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); + } - // FIXME: Handle alignment requirements. auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) - .legalForTypesWithMemDesc({ - {S32, GlobalPtr, 8, 8}, - {S32, GlobalPtr, 16, 8}, - {S32, LocalPtr, 8, 8}, - {S32, LocalPtr, 16, 8}, - {S32, PrivatePtr, 8, 8}, - {S32, PrivatePtr, 16, 8}}); + .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, + {S32, GlobalPtr, 16, 2 * 8}, + {S32, LocalPtr, 8, 8}, + {S32, LocalPtr, 16, 16}, + {S32, PrivatePtr, 8, 8}, + {S32, PrivatePtr, 16, 16}, + {S32, ConstantPtr, 8, 8}, + {S32, ConstantPtr, 16, 2 * 8}}); if (ST.hasFlatAddressSpace()) { - ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8}, - {S32, FlatPtr, 16, 8}}); + ExtLoads.legalForTypesWithMemDesc( + {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); } ExtLoads.clampScalar(0, S32, S32) @@ -590,6 +855,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); } + getActionDefinitionsBuilder(G_ATOMICRMW_FADD) + .legalFor({{S32, LocalPtr}}); + + getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) + .lower(); + // TODO: Pointer types, any 32-bit or 64-bit vector getActionDefinitionsBuilder(G_SELECT) .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, @@ -643,7 +914,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return (EltTy.getSizeInBits() == 16 || EltTy.getSizeInBits() % 32 == 0) && VecTy.getSizeInBits() % 32 == 0 && - VecTy.getSizeInBits() <= 512 && + VecTy.getSizeInBits() <= 1024 && IdxTy.getSizeInBits() == 32; }) .clampScalar(EltTypeIdx, S32, S64) @@ -663,6 +934,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // FIXME: Doesn't handle extract of illegal sizes. getActionDefinitionsBuilder(Op) + .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) + // FIXME: Multiples of 16 should not be legal. 
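// Editorial note, not part of the patch: the lowerIf above routes s16
// extracts/inserts on 32-bit values through the generic lowering (typically a
// shift-and-truncate sequence) instead of leaving them as G_EXTRACT/G_INSERT,
// which the multiples-of-16 legalIf below would otherwise accept.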
.legalIf([=](const LegalityQuery &Query) { const LLT BigTy = Query.Types[BigTyIdx]; const LLT LitTy = Query.Types[LitTyIdx]; @@ -686,18 +959,36 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, } - getActionDefinitionsBuilder(G_BUILD_VECTOR) - .legalForCartesianProduct(AllS32Vectors, {S32}) - .legalForCartesianProduct(AllS64Vectors, {S64}) - .clampNumElements(0, V16S32, V16S32) - .clampNumElements(0, V2S64, V8S64) - .minScalarSameAs(1, 0) - .legalIf(isRegisterType(0)) - .minScalarOrElt(0, S32); + auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) + .legalForCartesianProduct(AllS32Vectors, {S32}) + .legalForCartesianProduct(AllS64Vectors, {S64}) + .clampNumElements(0, V16S32, V32S32) + .clampNumElements(0, V2S64, V16S64) + .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); + + if (ST.hasScalarPackInsts()) + BuildVector.legalFor({V2S16, S32}); + + BuildVector + .minScalarSameAs(1, 0) + .legalIf(isRegisterType(0)) + .minScalarOrElt(0, S32); + + if (ST.hasScalarPackInsts()) { + getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) + .legalFor({V2S16, S32}) + .lower(); + } else { + getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) + .lower(); + } getActionDefinitionsBuilder(G_CONCAT_VECTORS) .legalIf(isRegisterType(0)); + // TODO: Don't fully scalarize v2s16 pieces + getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); + // Merge/Unmerge for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; @@ -715,14 +1006,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return false; }; - getActionDefinitionsBuilder(Op) + auto &Builder = getActionDefinitionsBuilder(Op) .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) // Clamp the little scalar to s8-s256 and make it a power of 2. It's not // worth considering the multiples of 64 since 2*192 and 2*384 are not // valid. .clampScalar(LitTyIdx, S16, S256) .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) - + .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) + .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), + elementTypeIs(1, S16)), + changeTo(1, V2S16)) // Break up vectors with weird elements into scalars .fewerElementsIf( [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, @@ -730,25 +1024,37 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .fewerElementsIf( [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, scalarize(1)) - .clampScalar(BigTyIdx, S32, S512) - .widenScalarIf( + .clampScalar(BigTyIdx, S32, S1024) + .lowerFor({{S16, V2S16}}); + + if (Op == G_MERGE_VALUES) { + Builder.widenScalarIf( + // TODO: Use 16-bit shifts if legal for 8-bit values? [=](const LegalityQuery &Query) { - const LLT &Ty = Query.Types[BigTyIdx]; - return !isPowerOf2_32(Ty.getSizeInBits()) && - Ty.getSizeInBits() % 16 != 0; + const LLT Ty = Query.Types[LitTyIdx]; + return Ty.getSizeInBits() < 32; }, - [=](const LegalityQuery &Query) { - // Pick the next power of 2, or a multiple of 64 over 128. - // Whichever is smaller. 
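// Editorial example for the pick-next-pow2-or-multiple-of-64 mutation (the
// logic is unchanged by this patch, only re-homed): a 300-bit merge type
// trips the predicate (not a power of 2, 300 % 16 != 0). The next power of 2
// is 1 << Log2_32_Ceil(301) = 512; since that is >= 256, alignTo<64>(301) =
// 320 wins as the smaller choice, so s300 widens to s320.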
- const LLT &Ty = Query.Types[BigTyIdx]; - unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); - if (NewSizeInBits >= 256) { - unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); - if (RoundedTo < NewSizeInBits) - NewSizeInBits = RoundedTo; - } - return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); - }) + changeTo(LitTyIdx, S32)); + } + + Builder.widenScalarIf( + [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[BigTyIdx]; + return !isPowerOf2_32(Ty.getSizeInBits()) && + Ty.getSizeInBits() % 16 != 0; + }, + [=](const LegalityQuery &Query) { + // Pick the next power of 2, or a multiple of 64 over 128. + // Whichever is smaller. + const LLT &Ty = Query.Types[BigTyIdx]; + unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); + if (NewSizeInBits >= 256) { + unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); + if (RoundedTo < NewSizeInBits) + NewSizeInBits = RoundedTo; + } + return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); + }) .legalIf([=](const LegalityQuery &Query) { const LLT &BigTy = Query.Types[BigTyIdx]; const LLT &LitTy = Query.Types[LitTyIdx]; @@ -760,43 +1066,56 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return BigTy.getSizeInBits() % 16 == 0 && LitTy.getSizeInBits() % 16 == 0 && - BigTy.getSizeInBits() <= 512; + BigTy.getSizeInBits() <= 1024; }) // Any vectors left are the wrong size. Scalarize them. .scalarize(0) .scalarize(1); } + getActionDefinitionsBuilder(G_SEXT_INREG).lower(); + computeTables(); verify(*ST.getInstrInfo()); } bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder, + MachineIRBuilder &B, GISelChangeObserver &Observer) const { switch (MI.getOpcode()) { case TargetOpcode::G_ADDRSPACE_CAST: - return legalizeAddrSpaceCast(MI, MRI, MIRBuilder); + return legalizeAddrSpaceCast(MI, MRI, B); case TargetOpcode::G_FRINT: - return legalizeFrint(MI, MRI, MIRBuilder); + return legalizeFrint(MI, MRI, B); case TargetOpcode::G_FCEIL: - return legalizeFceil(MI, MRI, MIRBuilder); + return legalizeFceil(MI, MRI, B); case TargetOpcode::G_INTRINSIC_TRUNC: - return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder); + return legalizeIntrinsicTrunc(MI, MRI, B); case TargetOpcode::G_SITOFP: - return legalizeITOFP(MI, MRI, MIRBuilder, true); + return legalizeITOFP(MI, MRI, B, true); case TargetOpcode::G_UITOFP: - return legalizeITOFP(MI, MRI, MIRBuilder, false); + return legalizeITOFP(MI, MRI, B, false); case TargetOpcode::G_FMINNUM: case TargetOpcode::G_FMAXNUM: case TargetOpcode::G_FMINNUM_IEEE: case TargetOpcode::G_FMAXNUM_IEEE: - return legalizeMinNumMaxNum(MI, MRI, MIRBuilder); + return legalizeMinNumMaxNum(MI, MRI, B); case TargetOpcode::G_EXTRACT_VECTOR_ELT: - return legalizeExtractVectorElt(MI, MRI, MIRBuilder); + return legalizeExtractVectorElt(MI, MRI, B); case TargetOpcode::G_INSERT_VECTOR_ELT: - return legalizeInsertVectorElt(MI, MRI, MIRBuilder); + return legalizeInsertVectorElt(MI, MRI, B); + case TargetOpcode::G_FSIN: + case TargetOpcode::G_FCOS: + return legalizeSinCos(MI, MRI, B); + case TargetOpcode::G_GLOBAL_VALUE: + return legalizeGlobalValue(MI, MRI, B); + case TargetOpcode::G_LOAD: + return legalizeLoad(MI, MRI, B, Observer); + case TargetOpcode::G_FMAD: + return legalizeFMad(MI, MRI, B); + case TargetOpcode::G_FDIV: + return legalizeFDIV(MI, MRI, B); default: return false; } @@ -807,11 +1126,13 @@ bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, Register AMDGPULegalizerInfo::getSegmentAperture( unsigned 
AS, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const { - MachineFunction &MF = MIRBuilder.getMF(); + MachineIRBuilder &B) const { + MachineFunction &MF = B.getMF(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const LLT S32 = LLT::scalar(32); + assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); + if (ST.hasApertureRegs()) { // FIXME: Use inline constants (src_{shared, private}_base) instead of // getreg. @@ -829,13 +1150,13 @@ Register AMDGPULegalizerInfo::getSegmentAperture( Register ApertureReg = MRI.createGenericVirtualRegister(S32); Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32) + B.buildInstr(AMDGPU::S_GETREG_B32) .addDef(GetReg) .addImm(Encoding); MRI.setType(GetReg, S32); - auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1); - MIRBuilder.buildInstr(TargetOpcode::G_SHL) + auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); + B.buildInstr(TargetOpcode::G_SHL) .addDef(ApertureReg) .addUse(GetReg) .addUse(ShiftAmt.getReg(0)); @@ -846,8 +1167,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture( Register QueuePtr = MRI.createGenericVirtualRegister( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); - // FIXME: Placeholder until we can track the input registers. - MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) + return Register(); // Offset into amd_queue_t for group_segment_aperture_base_hi / // private_segment_aperture_base_hi. @@ -870,18 +1192,19 @@ Register AMDGPULegalizerInfo::getSegmentAperture( Register LoadResult = MRI.createGenericVirtualRegister(S32); Register LoadAddr; - MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); - MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO); + B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); + B.buildLoad(LoadResult, LoadAddr, *MMO); return LoadResult; } bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const { - MachineFunction &MF = MIRBuilder.getMF(); + MachineIRBuilder &B) const { + MachineFunction &MF = B.getMF(); - MIRBuilder.setInstr(MI); + B.setInstr(MI); + const LLT S32 = LLT::scalar(32); Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); @@ -899,7 +1222,28 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { - MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST)); + MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); + return true; + } + + if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { + // Truncate. + B.buildExtract(Dst, Src, 0); + MI.eraseFromParent(); + return true; + } + + if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + uint32_t AddrHiVal = Info->get32BitAddressHighBits(); + + // FIXME: This is a bit ugly due to creating a merge of 2 pointers to + // another. Merge operands are required to be the same type, but creating an + // extra ptrtoint would be kind of pointless. 
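// Editorial example, not part of the patch: if get32BitAddressHighBits()
// returned, say, 0x0000FFFF, a 32-bit constant pointer %p would widen to the
// 64-bit constant pointer G_MERGE_VALUES(%p, 0x0000FFFF): the low half is the
// original pointer, the high half the per-function constant.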
+ auto HighAddr = B.buildConstant( + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); + B.buildMerge(Dst, {Src, HighAddr.getReg(0)}); + MI.eraseFromParent(); return true; } @@ -908,47 +1252,52 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( DestAS == AMDGPUAS::PRIVATE_ADDRESS); unsigned NullVal = TM.getNullPointerValue(DestAS); - auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal); - auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0); + auto SegmentNull = B.buildConstant(DstTy, NullVal); + auto FlatNull = B.buildConstant(SrcTy, 0); Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); // Extract low 32-bits of the pointer. - MIRBuilder.buildExtract(PtrLo32, Src, 0); + B.buildExtract(PtrLo32, Src, 0); Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); - MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); - MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); + B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); + B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); MI.eraseFromParent(); return true; } - assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS || - SrcAS == AMDGPUAS::PRIVATE_ADDRESS); + if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) + return false; + + if (!ST.hasFlatAddressSpace()) + return false; auto SegmentNull = - MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); + B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); auto FlatNull = - MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); + B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); - Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder); + Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); + if (!ApertureReg.isValid()) + return false; Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); - MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); + B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); // Coerce the type of the low half of the result so we can use merge_values. - Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32)); - MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT) + Register SrcAsInt = MRI.createGenericVirtualRegister(S32); + B.buildInstr(TargetOpcode::G_PTRTOINT) .addDef(SrcAsInt) .addUse(Src); // TODO: Should we allow mismatched types but matching sizes in merges to // avoid the ptrtoint? 
- MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); - MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); + B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); + B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); MI.eraseFromParent(); return true; @@ -956,8 +1305,8 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( bool AMDGPULegalizerInfo::legalizeFrint( MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const { - MIRBuilder.setInstr(MI); + MachineIRBuilder &B) const { + B.setInstr(MI); Register Src = MI.getOperand(1).getReg(); LLT Ty = MRI.getType(Src); @@ -966,18 +1315,18 @@ bool AMDGPULegalizerInfo::legalizeFrint( APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); - auto C1 = MIRBuilder.buildFConstant(Ty, C1Val); - auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src); + auto C1 = B.buildFConstant(Ty, C1Val); + auto CopySign = B.buildFCopysign(Ty, C1, Src); // TODO: Should this propagate fast-math-flags? - auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign); - auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign); + auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); + auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); - auto C2 = MIRBuilder.buildFConstant(Ty, C2Val); - auto Fabs = MIRBuilder.buildFAbs(Ty, Src); + auto C2 = B.buildFConstant(Ty, C2Val); + auto Fabs = B.buildFAbs(Ty, Src); - auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); - MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); + auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); + B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); return true; } @@ -1124,7 +1473,7 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( MachineIRBuilder HelperBuilder(MI); GISelObserverWrapper DummyObserver; LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); - HelperBuilder.setMBB(*MI.getParent()); + HelperBuilder.setInstr(MI); return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; } @@ -1187,6 +1536,194 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt( return true; } +bool AMDGPULegalizerInfo::legalizeSinCos( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + B.setInstr(MI); + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(DstReg); + unsigned Flags = MI.getFlags(); + + Register TrigVal; + auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); + if (ST.hasTrigReducedRange()) { + auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); + TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) + .addUse(MulVal.getReg(0)) + .setMIFlags(Flags).getReg(0); + } else + TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); + + Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
+ Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; + B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) + .addUse(TrigVal) + .setMIFlags(Flags); + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( + Register DstReg, LLT PtrTy, + MachineIRBuilder &B, const GlobalValue *GV, + unsigned Offset, unsigned GAFlags) const { + // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered + // to the following code sequence: + // + // For constant address space: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol + // s_addc_u32 s1, s1, 0 + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // a fixup or relocation is emitted to replace $symbol with a literal + // constant, which is a pc-relative offset from the encoding of the $symbol + // operand to the global variable. + // + // For global address space: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo + // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // fixups or relocations are emitted to replace $symbol@*@lo and + // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, + // which is a 64-bit pc-relative offset from the encoding of the $symbol + // operand to the global variable. + // + // What we want here is an offset from the value returned by s_getpc + // (which is the address of the s_add_u32 instruction) to the global + // variable, but since the encoding of $symbol starts 4 bytes after the start + // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too + // small. This requires us to add 4 to the global variable offset in order to + // compute the correct address. + + LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); + + Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : + B.getMRI()->createGenericVirtualRegister(ConstPtrTy); + + MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) + .addDef(PCReg); + + MIB.addGlobalAddress(GV, Offset + 4, GAFlags); + if (GAFlags == SIInstrInfo::MO_NONE) + MIB.addImm(0); + else + MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); + + B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); + + if (PtrTy.getSizeInBits() == 32) + B.buildExtract(DstReg, PCReg, 0); + return true; + } + +bool AMDGPULegalizerInfo::legalizeGlobalValue( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + Register DstReg = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(DstReg); + unsigned AS = Ty.getAddressSpace(); + + const GlobalValue *GV = MI.getOperand(1).getGlobal(); + MachineFunction &MF = B.getMF(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + B.setInstr(MI); + + if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { + if (!MFI->isEntryFunction()) { + const Function &Fn = MF.getFunction(); + DiagnosticInfoUnsupported BadLDSDecl( + Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); + Fn.getContext().diagnose(BadLDSDecl); + } + + // TODO: We could emit code to handle the initialization somewhere. 
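// Editorial note on buildPCRelGlobalAddress above, not part of the patch:
// s_getpc_b64 yields the address of the following s_add_u32, while the
// $symbol literal being patched sits 4 bytes into that instruction. Biasing
// every addGlobalAddress() offset by +4 cancels that gap, so the resolved
// pc-relative distance lands on the variable instead of 4 bytes short.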
+    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
+      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
+      MI.eraseFromParent();
+      return true;
+    }
+
+    const Function &Fn = MF.getFunction();
+    DiagnosticInfoUnsupported BadInit(
+        Fn, "unsupported initializer for address space", MI.getDebugLoc());
+    Fn.getContext().diagnose(BadInit);
+    return true;
+  }
+
+  const SITargetLowering *TLI = ST.getTargetLowering();
+
+  if (TLI->shouldEmitFixup(GV)) {
+    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  if (TLI->shouldEmitPCReloc(GV)) {
+    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
+
+  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getGOT(MF),
+      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+      MachineMemOperand::MOInvariant,
+      8 /*Size*/, 8 /*Align*/);
+
+  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
+
+  if (Ty.getSizeInBits() == 32) {
+    // Truncate if this is a 32-bit constant address.
+    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
+    B.buildExtract(DstReg, Load, 0);
+  } else
+    B.buildLoad(DstReg, GOTAddr, *GOTMMO);
+
+  MI.eraseFromParent();
+  return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeLoad(
+  MachineInstr &MI, MachineRegisterInfo &MRI,
+  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
+  B.setInstr(MI);
+  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
+  Observer.changingInstr(MI);
+  MI.getOperand(1).setReg(Cast.getReg(0));
+  Observer.changedInstr(MI);
+  return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeFMad(
+  MachineInstr &MI, MachineRegisterInfo &MRI,
+  MachineIRBuilder &B) const {
+  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+  assert(Ty.isScalar());
+
+  // TODO: Always legal with future ftz flag.
+  if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
+    return true;
+  if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
+    return true;
+
+  MachineFunction &MF = B.getMF();
+
+  MachineIRBuilder HelperBuilder(MI);
+  GISelObserverWrapper DummyObserver;
+  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
+  HelperBuilder.setMBB(*MI.getParent());
+  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
+}
+
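// Editorial sketch of legalizeLoad above, not part of the patch. Rather than
// teach every load rule about the 32-bit constant address space, the pointer
// is widened up front:
//   %v:_(s32) = G_LOAD %p(p6)            ; p6: 32-bit constant addrspace
// becomes
//   %c:_(p4)  = G_ADDRSPACE_CAST %p(p6)  ; p4: 64-bit constant addrspace
//   %v:_(s32) = G_LOAD %c(p4)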
 // Return the use branch instruction, otherwise null if the usage is invalid.
 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                        MachineRegisterInfo &MRI) {
@@ -1212,10 +1749,9 @@ Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                          const ArgDescriptor *Arg) const {
-  if (!Arg->isRegister())
+  if (!Arg->isRegister() || !Arg->getRegister().isValid())
     return false; // TODO: Handle these
-  assert(Arg->getRegister() != 0);
   assert(Arg->getRegister().isPhysical());
 
   MachineRegisterInfo &MRI = *B.getMRI();
@@ -1229,19 +1765,30 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
     const unsigned Mask = Arg->getMask();
     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
-    auto ShiftAmt = B.buildConstant(S32, Shift);
-    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
-    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
+    Register AndMaskSrc = LiveIn;
+
+    if (Shift != 0) {
+      auto ShiftAmt = B.buildConstant(S32, Shift);
+      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
+    }
+
+    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
   } else
     B.buildCopy(DstReg, LiveIn);
 
   // Insert the argument copy if it doesn't already exist.
   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
   if (!MRI.getVRegDef(LiveIn)) {
+    // FIXME: Should have scoped insert pt
+    MachineBasicBlock &OrigInsBB = B.getMBB();
+    auto OrigInsPt = B.getInsertPt();
+
     MachineBasicBlock &EntryMBB = B.getMF().front();
     EntryMBB.addLiveIn(Arg->getRegister());
     B.setInsertPt(EntryMBB, EntryMBB.begin());
     B.buildCopy(LiveIn, Arg->getRegister());
+
+    B.setInsertPt(OrigInsBB, OrigInsPt);
   }
 
   return true;
@@ -1272,6 +1819,113 @@ bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
   return false;
 }
 
+bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
+                                       MachineRegisterInfo &MRI,
+                                       MachineIRBuilder &B) const {
+  B.setInstr(MI);
+
+  if (legalizeFastUnsafeFDIV(MI, MRI, B))
+    return true;
+
+  return false;
+}
+
+bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
+                                                 MachineRegisterInfo &MRI,
+                                                 MachineIRBuilder &B) const {
+  Register Res = MI.getOperand(0).getReg();
+  Register LHS = MI.getOperand(1).getReg();
+  Register RHS = MI.getOperand(2).getReg();
+
+  uint16_t Flags = MI.getFlags();
+
+  LLT ResTy = MRI.getType(Res);
+  LLT S32 = LLT::scalar(32);
+  LLT S64 = LLT::scalar(64);
+
+  const MachineFunction &MF = B.getMF();
+  bool Unsafe =
+      MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
+
+  if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
+    return false;
+
+  if (!Unsafe && ResTy == S32 && ST.hasFP32Denormals())
+    return false;
+
+  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
+    // 1 / x -> RCP(x)
+    if (CLHS->isExactlyValue(1.0)) {
+      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
+        .addUse(RHS)
+        .setMIFlags(Flags);
+
+      MI.eraseFromParent();
+      return true;
+    }
+
+    // -1 / x -> RCP( FNEG(x) )
+    if (CLHS->isExactlyValue(-1.0)) {
+      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
+      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
+        .addUse(FNeg.getReg(0))
+        .setMIFlags(Flags);
+
+      MI.eraseFromParent();
+      return true;
+    }
+  }
+
+  // x / y -> x * (1.0 / y)
+  if (Unsafe) {
+    auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
+      .addUse(RHS)
+      .setMIFlags(Flags);
+    B.buildFMul(Res, LHS, RCP, Flags);
+
+    MI.eraseFromParent();
+    return true;
+  }
+
+  return false;
+}
+
+bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
+                                                 MachineRegisterInfo &MRI,
+                                                 MachineIRBuilder &B) const {
+
B.setInstr(MI); + Register Res = MI.getOperand(0).getReg(); + Register LHS = MI.getOperand(2).getReg(); + Register RHS = MI.getOperand(3).getReg(); + uint16_t Flags = MI.getFlags(); + + LLT S32 = LLT::scalar(32); + LLT S1 = LLT::scalar(1); + + auto Abs = B.buildFAbs(S32, RHS, Flags); + const APFloat C0Val(1.0f); + + auto C0 = B.buildConstant(S32, 0x6f800000); + auto C1 = B.buildConstant(S32, 0x2f800000); + auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); + + auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); + auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); + + auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); + + auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) + .addUse(Mul0.getReg(0)) + .setMIFlags(Flags); + + auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); + + B.buildFMul(Res, Sel, Mul1, Flags); + + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -1306,11 +1960,79 @@ bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, return true; } +bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, + unsigned AddrSpace) const { + B.setInstr(MI); + Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); + auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); + B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); + MI.eraseFromParent(); + return true; +} + +/// Handle register layout difference for f16 images for some subtargets. +Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, + MachineRegisterInfo &MRI, + Register Reg) const { + if (!ST.hasUnpackedD16VMem()) + return Reg; + + const LLT S16 = LLT::scalar(16); + const LLT S32 = LLT::scalar(32); + LLT StoreVT = MRI.getType(Reg); + assert(StoreVT.isVector() && StoreVT.getElementType() == S16); + + auto Unmerge = B.buildUnmerge(S16, Reg); + + SmallVector<Register, 4> WideRegs; + for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) + WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); + + int NumElts = StoreVT.getNumElements(); + + return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); +} + +bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, + bool IsFormat) const { + // TODO: Reject f16 format on targets where unsupported. + Register VData = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(VData); + + B.setInstr(MI); + + const LLT S32 = LLT::scalar(32); + const LLT S16 = LLT::scalar(16); + + // Fixup illegal register types for i8 stores. + if (Ty == LLT::scalar(8) || Ty == S16) { + Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); + MI.getOperand(1).setReg(AnyExt); + return true; + } + + if (Ty.isVector()) { + if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { + if (IsFormat) + MI.getOperand(1).setReg(handleD16VData(B, MRI, VData)); + return true; + } + + return Ty.getElementType() == S32 && Ty.getNumElements() <= 4; + } + + return Ty == S32; +} + bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
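// Editorial note on legalizeFDIVFastIntrin above, not part of the patch:
// 0x6f800000 is 2.0^96 and 0x2f800000 is 2.0^-32 as IEEE-754 single-precision
// bit patterns. When |RHS| > 2^96 the denominator is pre-scaled by 2^-32
// before the rcp approximation, and the final product is multiplied by the
// same select result, so the scale cancels:
//   Sel * (LHS * rcp(RHS * Sel)) == LHS / RHS
// while keeping rcp's operand inside the representable range.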
- switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + switch (MI.getIntrinsicID()) { case Intrinsic::amdgcn_if: { if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { const SIRegisterInfo *TRI @@ -1386,6 +2108,22 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, case Intrinsic::amdgcn_dispatch_id: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::DISPATCH_ID); + case Intrinsic::amdgcn_fdiv_fast: + return legalizeFDIVFastIntrin(MI, MRI, B); + case Intrinsic::amdgcn_is_shared: + return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); + case Intrinsic::amdgcn_is_private: + return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); + case Intrinsic::amdgcn_wavefrontsize: { + B.setInstr(MI); + B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); + MI.eraseFromParent(); + return true; + } + case Intrinsic::amdgcn_raw_buffer_store: + return legalizeRawBufferStore(MI, MRI, B, false); + case Intrinsic::amdgcn_raw_buffer_store_format: + return legalizeRawBufferStore(MI, MRI, B, true); default: return true; } diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 3f1cc1d265dd..d0fba23a8686 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -16,6 +16,7 @@ #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "AMDGPUArgumentUsageInfo.h" +#include "SIInstrInfo.h" namespace llvm { @@ -32,29 +33,44 @@ public: const GCNTargetMachine &TM); bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder, + MachineIRBuilder &B, GISelChangeObserver &Observer) const override; Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeFrint(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder, bool Signed) const; + MachineIRBuilder &B, bool Signed) const; bool legalizeMinNumMaxNum(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; + bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + + bool buildPCRelGlobalAddress( + Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, + unsigned Offset, unsigned GAFlags = SIInstrInfo::MO_NONE) const; + + bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeLoad(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, + GISelChangeObserver &Observer) const; + + bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) 
const; Register getLiveInRegister(MachineRegisterInfo &MRI, Register Reg, LLT Ty) const; @@ -65,10 +81,24 @@ public: MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; + bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, unsigned AddrSpace) const; + + Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, + Register Reg) const; + bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, bool IsFormat) const; bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const override; + MachineIRBuilder &B) const override; }; } // End llvm namespace. diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp index ce0a9db7c7f4..2c94e0046651 100644 --- a/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -30,6 +30,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/ValueSymbolTable.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" @@ -48,18 +49,10 @@ static cl::list<std::string> UseNative("amdgpu-use-native", cl::CommaSeparated, cl::ValueOptional, cl::Hidden); -#define MATH_PI 3.14159265358979323846264338327950288419716939937511 -#define MATH_E 2.71828182845904523536028747135266249775724709369996 -#define MATH_SQRT2 1.41421356237309504880168872420969807856967187537695 - -#define MATH_LOG2E 1.4426950408889634073599246810018921374266459541529859 -#define MATH_LOG10E 0.4342944819032518276511289189166050822943970058036665 -// Value of log2(10) -#define MATH_LOG2_10 3.3219280948873623478703194294893901758648313930245806 -// Value of 1 / log2(10) -#define MATH_RLOG2_10 0.3010299956639811952137388947244930267681898814621085 -// Value of 1 / M_LOG2E_F = 1 / log2(e) -#define MATH_RLOG2_E 0.6931471805599453094172321214581765680755001343602552 +#define MATH_PI numbers::pi +#define MATH_E numbers::e +#define MATH_SQRT2 numbers::sqrt2 +#define MATH_SQRT1_2 numbers::inv_sqrt2 namespace llvm { @@ -254,8 +247,8 @@ struct TableEntry { /* a list of {result, input} */ static const TableEntry tbl_acos[] = { - {MATH_PI/2.0, 0.0}, - {MATH_PI/2.0, -0.0}, + {MATH_PI / 2.0, 0.0}, + {MATH_PI / 2.0, -0.0}, {0.0, 1.0}, {MATH_PI, -1.0} }; @@ -271,8 +264,8 @@ static const TableEntry tbl_acospi[] = { static const TableEntry tbl_asin[] = { {0.0, 0.0}, {-0.0, -0.0}, - {MATH_PI/2.0, 1.0}, - {-MATH_PI/2.0, -1.0} + {MATH_PI / 2.0, 1.0}, + {-MATH_PI / 2.0, -1.0} }; static const TableEntry tbl_asinh[] = { {0.0, 0.0}, @@ -287,8 +280,8 @@ static const TableEntry tbl_asinpi[] = { static const TableEntry tbl_atan[] = { {0.0, 0.0}, {-0.0, -0.0}, - {MATH_PI/4.0, 1.0}, - {-MATH_PI/4.0, -1.0} + {MATH_PI / 4.0, 1.0}, + {-MATH_PI / 4.0, -1.0} }; static const TableEntry tbl_atanh[] = { {0.0, 0.0}, @@ -359,7 +352,7 @@ static const TableEntry tbl_log10[] = { }; static const TableEntry tbl_rsqrt[] = { {1.0, 1.0}, - {1.0/MATH_SQRT2, 2.0} + {MATH_SQRT1_2, 
2.0} }; static const TableEntry tbl_sin[] = { {0.0, 0.0}, @@ -868,7 +861,7 @@ static double log2(double V) { #if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L return ::log2(V); #else - return log(V) / 0.693147180559945309417; + return log(V) / numbers::ln2; #endif } } @@ -1430,8 +1423,8 @@ AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B, B.SetInsertPoint(&*ItNew); AllocaInst *Alloc = B.CreateAlloca(RetType, 0, std::string(prefix) + UI->getName()); - Alloc->setAlignment(UCallee->getParent()->getDataLayout() - .getTypeAllocSize(RetType)); + Alloc->setAlignment(MaybeAlign( + UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType))); return Alloc; } diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/lib/Target/AMDGPU/AMDGPULibFunc.cpp index a5bac25701a0..e1ae496d9cbc 100644 --- a/lib/Target/AMDGPU/AMDGPULibFunc.cpp +++ b/lib/Target/AMDGPU/AMDGPULibFunc.cpp @@ -55,7 +55,7 @@ enum EManglingParam { }; struct ManglingRule { - StringRef const Name; + const char *Name; unsigned char Lead[2]; unsigned char Param[5]; @@ -69,7 +69,7 @@ struct ManglingRule { // Information about library functions with unmangled names. class UnmangledFuncInfo { - StringRef const Name; + const char *Name; unsigned NumArgs; // Table for all lib functions with unmangled names. @@ -82,7 +82,7 @@ class UnmangledFuncInfo { public: using ID = AMDGPULibFunc::EFuncId; - UnmangledFuncInfo(StringRef _Name, unsigned _NumArgs) + constexpr UnmangledFuncInfo(const char *_Name, unsigned _NumArgs) : Name(_Name), NumArgs(_NumArgs) {} // Get index to Table by function name. static bool lookup(StringRef Name, ID &Id); @@ -133,8 +133,8 @@ unsigned ManglingRule::getNumArgs() const { // E_ANY - use prev lead type, E_CONSTPTR_ANY - make const pointer out of // prev lead type, etc. see ParamIterator::getNextParam() for details. 
-static const ManglingRule manglingRules[] = { -{ StringRef(), {0}, {0} }, +static constexpr ManglingRule manglingRules[] = { +{ "", {0}, {0} }, { "abs" , {1}, {E_ANY}}, { "abs_diff" , {1}, {E_ANY,E_COPY}}, { "acos" , {1}, {E_ANY}}, @@ -682,9 +682,9 @@ bool AMDGPULibFunc::parse(StringRef FuncName, AMDGPULibFunc &F) { } if (eatTerm(FuncName, "_Z")) - F.Impl = make_unique<AMDGPUMangledLibFunc>(); + F.Impl = std::make_unique<AMDGPUMangledLibFunc>(); else - F.Impl = make_unique<AMDGPUUnmangledLibFunc>(); + F.Impl = std::make_unique<AMDGPUUnmangledLibFunc>(); if (F.Impl->parseFuncName(FuncName)) return true; diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index 5dd5b3691e0a..e64542a395f0 100644 --- a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -72,10 +72,10 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { BasicBlock &EntryBlock = *F.begin(); IRBuilder<> Builder(&*EntryBlock.begin()); - const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary + const Align KernArgBaseAlign(16); // FIXME: Increase if necessary const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F); - unsigned MaxAlign; + Align MaxAlign; // FIXME: Alignment is broken with explicit arg offset. const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign); if (TotalKernArgSize == 0) @@ -94,12 +94,12 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { for (Argument &Arg : F.args()) { Type *ArgTy = Arg.getType(); - unsigned Align = DL.getABITypeAlignment(ArgTy); + unsigned ABITypeAlign = DL.getABITypeAlignment(ArgTy); unsigned Size = DL.getTypeSizeInBits(ArgTy); unsigned AllocSize = DL.getTypeAllocSize(ArgTy); - uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset; - ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize; + uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset; + ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize; if (Arg.use_empty()) continue; @@ -128,8 +128,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { int64_t AlignDownOffset = alignDown(EltOffset, 4); int64_t OffsetDiff = EltOffset - AlignDownOffset; - unsigned AdjustedAlign = MinAlign(DoShiftOpt ? AlignDownOffset : EltOffset, - KernArgBaseAlign); + Align AdjustedAlign = commonAlignment( + KernArgBaseAlign, DoShiftOpt ?
AlignDownOffset : EltOffset); Value *ArgPtr; Type *AdjustedArgTy; @@ -160,7 +160,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS), ArgPtr->getName() + ".cast"); LoadInst *Load = - Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign); + Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign.value()); Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {})); MDBuilder MDB(Ctx); @@ -220,8 +220,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { } KernArgSegment->addAttribute( - AttributeList::ReturnIndex, - Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign))); + AttributeList::ReturnIndex, + Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign))); return true; } diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index ae4c32c258a7..3760aed87a43 100644 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -211,6 +211,10 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { lowerOperand(MO, MCOp); OutMI.addOperand(MCOp); } + + int FIIdx = AMDGPU::getNamedOperandIdx(MCOpcode, AMDGPU::OpName::fi); + if (FIIdx >= (int)OutMI.getNumOperands()) + OutMI.addOperand(MCOperand::createImm(0)); } bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO, diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index 237490957058..ba72f71f4322 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -694,7 +694,7 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg, const MachineRegisterInfo *MRI, const TargetRegisterInfo *TRI, PHILinearize &PHIInfo) { - if (TRI->isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n"); // If this is a source register to a PHI we are chaining, it @@ -734,7 +734,7 @@ void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg, const MachineRegisterInfo *MRI, const TargetRegisterInfo *TRI, PHILinearize &PHIInfo) { - if (TRI->isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n"); for (auto &UI : MRI->use_operands(Reg)) { @@ -949,7 +949,7 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister, (IncludeLoopPHI && IsLoopPHI); if (ShouldReplace) { - if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) { + if (Register::isPhysicalRegister(NewRegister)) { LLVM_DEBUG(dbgs() << "Trying to substitute physical register: " << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n"); @@ -1016,13 +1016,15 @@ bool LinearizedRegion::hasNoDef(unsigned Reg, MachineRegisterInfo *MRI) { // before are no longer register kills. void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) { const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); + (void)TRI; // It's used by LLVM_DEBUG. 
+ for (auto MBBI : MBBs) { MachineBasicBlock *MBB = MBBI; for (auto &II : *MBB) { for (auto &RI : II.uses()) { if (RI.isReg()) { - unsigned Reg = RI.getReg(); - if (TRI->isVirtualRegister(Reg)) { + Register Reg = RI.getReg(); + if (Register::isVirtualRegister(Reg)) { if (hasNoDef(Reg, MRI)) continue; if (!MRI->hasOneDef(Reg)) { @@ -1402,7 +1404,7 @@ void AMDGPUMachineCFGStructurizer::storePHILinearizationInfoDest( unsigned AMDGPUMachineCFGStructurizer::storePHILinearizationInfo( MachineInstr &PHI, SmallVector<unsigned, 2> *RegionIndices) { unsigned DestReg = getPHIDestReg(PHI); - unsigned LinearizeDestReg = + Register LinearizeDestReg = MRI->createVirtualRegister(MRI->getRegClass(DestReg)); PHIInfo.addDest(LinearizeDestReg, PHI.getDebugLoc()); storePHILinearizationInfoDest(LinearizeDestReg, PHI, RegionIndices); @@ -1890,7 +1892,7 @@ void AMDGPUMachineCFGStructurizer::ensureCondIsNotKilled( if (!Cond[0].isReg()) return; - unsigned CondReg = Cond[0].getReg(); + Register CondReg = Cond[0].getReg(); for (auto UI = MRI->use_begin(CondReg), E = MRI->use_end(); UI != E; ++UI) { (*UI).setIsKill(false); } @@ -1929,8 +1931,8 @@ void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *Co BBSelectReg, TrueBB->getNumber()); } else { const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectReg); - unsigned TrueBBReg = MRI->createVirtualRegister(RegClass); - unsigned FalseBBReg = MRI->createVirtualRegister(RegClass); + Register TrueBBReg = MRI->createVirtualRegister(RegClass); + Register FalseBBReg = MRI->createVirtualRegister(RegClass); TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, TrueBBReg, TrueBB->getNumber()); TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, @@ -1996,7 +1998,7 @@ void AMDGPUMachineCFGStructurizer::insertChainedPHI(MachineBasicBlock *IfBB, InnerRegion->replaceRegisterOutsideRegion(SourceReg, DestReg, false, MRI); } const TargetRegisterClass *RegClass = MRI->getRegClass(DestReg); - unsigned NextDestReg = MRI->createVirtualRegister(RegClass); + Register NextDestReg = MRI->createVirtualRegister(RegClass); bool IsLastDef = PHIInfo.getNumSources(DestReg) == 1; LLVM_DEBUG(dbgs() << "Insert Chained PHI\n"); insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, DestReg, NextDestReg, @@ -2056,8 +2058,8 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB, // register, unless it is the outgoing BB select register. We have // already created phi nodes for these. const TargetRegisterClass *RegClass = MRI->getRegClass(Reg); - unsigned PHIDestReg = MRI->createVirtualRegister(RegClass); - unsigned IfSourceReg = MRI->createVirtualRegister(RegClass); + Register PHIDestReg = MRI->createVirtualRegister(RegClass); + Register IfSourceReg = MRI->createVirtualRegister(RegClass); // Create initializer, this value is never used, but is needed // to satisfy SSA.
LLVM_DEBUG(dbgs() << "Initializer for reg: " << printReg(Reg) << "\n"); @@ -2172,7 +2174,7 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio MachineBasicBlock *PHIDefMBB = PHIDefInstr->getParent(); const TargetRegisterClass *RegClass = MRI->getRegClass(CurrentBackedgeReg); - unsigned NewBackedgeReg = MRI->createVirtualRegister(RegClass); + Register NewBackedgeReg = MRI->createVirtualRegister(RegClass); MachineInstrBuilder BackedgePHI = BuildMI(*PHIDefMBB, PHIDefMBB->instr_begin(), DL, TII->get(TargetOpcode::PHI), NewBackedgeReg); @@ -2230,7 +2232,7 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register, I != E;) { MachineOperand &O = *I; ++I; - if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) { + if (Register::isPhysicalRegister(NewRegister)) { LLVM_DEBUG(dbgs() << "Trying to substitute physical register: " << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n"); @@ -2309,7 +2311,7 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion( } else { // Handle internal block. const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectRegIn); - unsigned CodeBBSelectReg = MRI->createVirtualRegister(RegClass); + Register CodeBBSelectReg = MRI->createVirtualRegister(RegClass); rewriteCodeBBTerminator(CodeBB, MergeBB, CodeBBSelectReg); bool IsRegionEntryBB = CurrentRegion->getEntry() == CodeBB; MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeBB, CodeBB, CodeBB, @@ -2446,7 +2448,7 @@ void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI, } const TargetRegisterClass *RegClass = MRI->getRegClass(PHIDest); - unsigned NewDestReg = MRI->createVirtualRegister(RegClass); + Register NewDestReg = MRI->createVirtualRegister(RegClass); LRegion->replaceRegisterInsideRegion(PHIDest, NewDestReg, false, MRI); MachineInstrBuilder MIB = BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(), @@ -2734,9 +2736,9 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { } const DebugLoc &DL = NewSucc->findDebugLoc(NewSucc->getFirstNonPHI()); unsigned InReg = LRegion->getBBSelectRegIn(); - unsigned InnerSelectReg = + Register InnerSelectReg = MRI->createVirtualRegister(MRI->getRegClass(InReg)); - unsigned NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg)); + Register NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg)); TII->materializeImmediate(*(LRegion->getEntry()), LRegion->getEntry()->getFirstTerminator(), DL, NewInReg, Region->getEntry()->getNumber()); diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 0d3a1f1a769f..89ca702f577d 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -17,7 +17,6 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), LocalMemoryObjects(), ExplicitKernArgSize(0), - MaxKernArgAlign(0), LDSSize(0), IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())), NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath), diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 52987e2fa411..9818ab1ef148 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -23,7 +23,7 @@ class AMDGPUMachineFunction : public MachineFunctionInfo { protected: uint64_t ExplicitKernArgSize; // Cache for this. - unsigned MaxKernArgAlign; // Cache for this. 
+ Align MaxKernArgAlign; // Cache for this. /// Number of bytes in the LDS that are being used. unsigned LDSSize; @@ -47,9 +47,7 @@ public: return ExplicitKernArgSize; } - unsigned getMaxKernArgAlign() const { - return MaxKernArgAlign; - } + unsigned getMaxKernArgAlign() const { return MaxKernArgAlign.value(); } unsigned getLDSSize() const { return LDSSize; diff --git a/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp new file mode 100644 index 000000000000..5250bf455d71 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -0,0 +1,592 @@ +//=== AMDGPUPrintfRuntimeBinding.cpp - OpenCL printf implementation -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// \file +// +// The pass binds printfs to a kernel arg pointer that will be bound to a buffer +// later by the runtime. +// +// This pass traverses the functions in the module and converts +// each call to printf to a sequence of operations that +// store the following into the printf buffer: +// - format string (passed as a module's metadata unique ID) +// - bitwise copies of printf arguments +// The backend passes will need to store metadata in the kernel. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +using namespace llvm; + +#define DEBUG_TYPE "printfToRuntime" +#define DWORD_ALIGN 4 + +namespace { +class LLVM_LIBRARY_VISIBILITY AMDGPUPrintfRuntimeBinding final + : public ModulePass { + +public: + static char ID; + + explicit AMDGPUPrintfRuntimeBinding(); + +private: + bool runOnModule(Module &M) override; + void getConversionSpecifiers(SmallVectorImpl<char> &OpConvSpecifiers, + StringRef fmt, size_t num_ops) const; + + bool shouldPrintAsStr(char Specifier, Type *OpType) const; + bool + lowerPrintfForGpu(Module &M, + function_ref<const TargetLibraryInfo &(Function &)> GetTLI); + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + } + + Value *simplify(Instruction *I, const TargetLibraryInfo *TLI) { + return SimplifyInstruction(I, {*TD, TLI, DT}); + } + + const DataLayout *TD; + const DominatorTree *DT; + SmallVector<CallInst *, 32> Printfs; +}; +} // namespace + +char AMDGPUPrintfRuntimeBinding::ID = 0; + +INITIALIZE_PASS_BEGIN(AMDGPUPrintfRuntimeBinding, + "amdgpu-printf-runtime-binding", "AMDGPU Printf lowering", + false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(AMDGPUPrintfRuntimeBinding,
"amdgpu-printf-runtime-binding", + "AMDGPU Printf lowering", false, false) + +char &llvm::AMDGPUPrintfRuntimeBindingID = AMDGPUPrintfRuntimeBinding::ID; + +namespace llvm { +ModulePass *createAMDGPUPrintfRuntimeBinding() { + return new AMDGPUPrintfRuntimeBinding(); +} +} // namespace llvm + +AMDGPUPrintfRuntimeBinding::AMDGPUPrintfRuntimeBinding() + : ModulePass(ID), TD(nullptr), DT(nullptr) { + initializeAMDGPUPrintfRuntimeBindingPass(*PassRegistry::getPassRegistry()); +} + +void AMDGPUPrintfRuntimeBinding::getConversionSpecifiers( + SmallVectorImpl<char> &OpConvSpecifiers, StringRef Fmt, + size_t NumOps) const { + // Not all format characters are collected. + // At this time the format characters of interest + // are %p and %s, which we use to know if we + // are either storing a literal string or a + // pointer to the printf buffer. + static const char ConvSpecifiers[] = "cdieEfgGaosuxXp"; + size_t CurFmtSpecifierIdx = 0; + size_t PrevFmtSpecifierIdx = 0; + + while ((CurFmtSpecifierIdx = Fmt.find_first_of( + ConvSpecifiers, CurFmtSpecifierIdx)) != StringRef::npos) { + bool ArgDump = false; + StringRef CurFmt = Fmt.substr(PrevFmtSpecifierIdx, + CurFmtSpecifierIdx - PrevFmtSpecifierIdx); + size_t pTag = CurFmt.find_last_of("%"); + if (pTag != StringRef::npos) { + ArgDump = true; + while (pTag && CurFmt[--pTag] == '%') { + ArgDump = !ArgDump; + } + } + + if (ArgDump) + OpConvSpecifiers.push_back(Fmt[CurFmtSpecifierIdx]); + + PrevFmtSpecifierIdx = ++CurFmtSpecifierIdx; + } +} + +bool AMDGPUPrintfRuntimeBinding::shouldPrintAsStr(char Specifier, + Type *OpType) const { + if (Specifier != 's') + return false; + const PointerType *PT = dyn_cast<PointerType>(OpType); + if (!PT || PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) + return false; + Type *ElemType = PT->getContainedType(0); + if (ElemType->getTypeID() != Type::IntegerTyID) + return false; + IntegerType *ElemIType = cast<IntegerType>(ElemType); + return ElemIType->getBitWidth() == 8; +} + +bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu( + Module &M, function_ref<const TargetLibraryInfo &(Function &)> GetTLI) { + LLVMContext &Ctx = M.getContext(); + IRBuilder<> Builder(Ctx); + Type *I32Ty = Type::getInt32Ty(Ctx); + unsigned UniqID = 0; + // NB: It is important for this string size to be divisible by 4 + const char NonLiteralStr[4] = "???"; + + for (auto CI : Printfs) { + unsigned NumOps = CI->getNumArgOperands(); + + SmallString<16> OpConvSpecifiers; + Value *Op = CI->getArgOperand(0); + + if (auto LI = dyn_cast<LoadInst>(Op)) { + Op = LI->getPointerOperand(); + for (auto Use : Op->users()) { + if (auto SI = dyn_cast<StoreInst>(Use)) { + Op = SI->getValueOperand(); + break; + } + } + } + + if (auto I = dyn_cast<Instruction>(Op)) { + Value *Op_simplified = simplify(I, &GetTLI(*I->getFunction())); + if (Op_simplified) + Op = Op_simplified; + } + + ConstantExpr *ConstExpr = dyn_cast<ConstantExpr>(Op); + + if (ConstExpr) { + GlobalVariable *GVar = dyn_cast<GlobalVariable>(ConstExpr->getOperand(0)); + + StringRef Str("unknown"); + if (GVar && GVar->hasInitializer()) { + auto Init = GVar->getInitializer(); + if (auto CA = dyn_cast<ConstantDataArray>(Init)) { + if (CA->isString()) + Str = CA->getAsCString(); + } else if (isa<ConstantAggregateZero>(Init)) { + Str = ""; + } + // + // we need this call to ascertain + // that we are printing a string + // or a pointer.
It takes out the + // specifiers and fills up the first + // arg + getConversionSpecifiers(OpConvSpecifiers, Str, NumOps - 1); + } + // Add metadata for the string + std::string AStreamHolder; + raw_string_ostream Sizes(AStreamHolder); + int Sum = DWORD_ALIGN; + Sizes << CI->getNumArgOperands() - 1; + Sizes << ':'; + for (unsigned ArgCount = 1; ArgCount < CI->getNumArgOperands() && + ArgCount <= OpConvSpecifiers.size(); + ArgCount++) { + Value *Arg = CI->getArgOperand(ArgCount); + Type *ArgType = Arg->getType(); + unsigned ArgSize = TD->getTypeAllocSizeInBits(ArgType); + ArgSize = ArgSize / 8; + // + // ArgSize by design should be a multiple of DWORD_ALIGN, + // expand the arguments that do not follow this rule. + // + if (ArgSize % DWORD_ALIGN != 0) { + llvm::Type *ResType = llvm::Type::getInt32Ty(Ctx); + VectorType *LLVMVecType = llvm::dyn_cast<llvm::VectorType>(ArgType); + int NumElem = LLVMVecType ? LLVMVecType->getNumElements() : 1; + if (LLVMVecType && NumElem > 1) + ResType = llvm::VectorType::get(ResType, NumElem); + Builder.SetInsertPoint(CI); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + if (OpConvSpecifiers[ArgCount - 1] == 'x' || + OpConvSpecifiers[ArgCount - 1] == 'X' || + OpConvSpecifiers[ArgCount - 1] == 'u' || + OpConvSpecifiers[ArgCount - 1] == 'o') + Arg = Builder.CreateZExt(Arg, ResType); + else + Arg = Builder.CreateSExt(Arg, ResType); + ArgType = Arg->getType(); + ArgSize = TD->getTypeAllocSizeInBits(ArgType); + ArgSize = ArgSize / 8; + CI->setOperand(ArgCount, Arg); + } + if (OpConvSpecifiers[ArgCount - 1] == 'f') { + ConstantFP *FpCons = dyn_cast<ConstantFP>(Arg); + if (FpCons) + ArgSize = 4; + else { + FPExtInst *FpExt = dyn_cast<FPExtInst>(Arg); + if (FpExt && FpExt->getType()->isDoubleTy() && + FpExt->getOperand(0)->getType()->isFloatTy()) + ArgSize = 4; + } + } + if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1], ArgType)) { + if (ConstantExpr *ConstExpr = dyn_cast<ConstantExpr>(Arg)) { + GlobalVariable *GV = + dyn_cast<GlobalVariable>(ConstExpr->getOperand(0)); + if (GV && GV->hasInitializer()) { + Constant *Init = GV->getInitializer(); + ConstantDataArray *CA = dyn_cast<ConstantDataArray>(Init); + if (Init->isZeroValue() || CA->isString()) { + size_t SizeStr = Init->isZeroValue() + ? 1 + : (strlen(CA->getAsCString().data()) + 1); + size_t Rem = SizeStr % DWORD_ALIGN; + size_t NSizeStr = 0; + LLVM_DEBUG(dbgs() << "Printf string original size = " << SizeStr + << '\n'); + if (Rem) { + NSizeStr = SizeStr + (DWORD_ALIGN - Rem); + } else { + NSizeStr = SizeStr; + } + ArgSize = NSizeStr; + } + } else { + ArgSize = sizeof(NonLiteralStr); + } + } else { + ArgSize = sizeof(NonLiteralStr); + } + } + LLVM_DEBUG(dbgs() << "Printf ArgSize (in buffer) = " << ArgSize + << " for type: " << *ArgType << '\n'); + Sizes << ArgSize << ':'; + Sum += ArgSize; + } + LLVM_DEBUG(dbgs() << "Printf format string in source = " << Str.str() + << '\n'); + for (size_t I = 0; I < Str.size(); ++I) { + // Rest of the C escape sequences (e.g. 
\') are handled correctly + // by the MDParser + switch (Str[I]) { + case '\a': + Sizes << "\\a"; + break; + case '\b': + Sizes << "\\b"; + break; + case '\f': + Sizes << "\\f"; + break; + case '\n': + Sizes << "\\n"; + break; + case '\r': + Sizes << "\\r"; + break; + case '\v': + Sizes << "\\v"; + break; + case ':': + // ':' cannot be scanned by Flex, as it is defined as a delimiter + // Replace it with its octal representation \72 + Sizes << "\\72"; + break; + default: + Sizes << Str[I]; + break; + } + } + + // Insert the printf_alloc call + Builder.SetInsertPoint(CI); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + AttributeList Attr = AttributeList::get(Ctx, AttributeList::FunctionIndex, + Attribute::NoUnwind); + + Type *SizetTy = Type::getInt32Ty(Ctx); + + Type *Tys_alloc[1] = {SizetTy}; + Type *I8Ptr = PointerType::get(Type::getInt8Ty(Ctx), 1); + FunctionType *FTy_alloc = FunctionType::get(I8Ptr, Tys_alloc, false); + FunctionCallee PrintfAllocFn = + M.getOrInsertFunction(StringRef("__printf_alloc"), FTy_alloc, Attr); + + LLVM_DEBUG(dbgs() << "Printf metadata = " << Sizes.str() << '\n'); + std::string fmtstr = itostr(++UniqID) + ":" + Sizes.str().c_str(); + MDString *fmtStrArray = MDString::get(Ctx, fmtstr); + + // Instead of creating global variables, the + // printf format strings are extracted + // and passed as metadata. This avoids + // polluting llvm's symbol tables in this module. + // Metadata is going to be extracted + // by the backend passes and inserted + // into the OpenCL binary as appropriate. + StringRef amd("llvm.printf.fmts"); + NamedMDNode *metaD = M.getOrInsertNamedMetadata(amd); + MDNode *myMD = MDNode::get(Ctx, fmtStrArray); + metaD->addOperand(myMD); + Value *sumC = ConstantInt::get(SizetTy, Sum, false); + SmallVector<Value *, 1> alloc_args; + alloc_args.push_back(sumC); + CallInst *pcall = + CallInst::Create(PrintfAllocFn, alloc_args, "printf_alloc_fn", CI); + + // + // Insert code to split the basic block with a + // piece of hammock code; the block splits + // after the buffer overflow check. + //
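// In effect the lowering wraps each printf in the following shape (a hedged
// pseudo-source sketch; __printf_alloc is the runtime hook declared above,
// the remaining names are illustrative only):
extern "C" char *__printf_alloc(unsigned Size);
static int loweredPrintfShape(unsigned Sum, int UniqID) {
  char *Buf = __printf_alloc(Sum);          // "printf_alloc_fn"
  if (Buf != nullptr) {                     // the hammock's then-block
    *reinterpret_cast<int *>(Buf) = UniqID; // first 4 bytes: format-string ID
    // ... argument stores follow at Buf + 4 ...
  }
  return Buf != nullptr ? 0 : -1;           // "printf_res" = sext(!cmp)
}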
+ ConstantPointerNull *zeroIntPtr = + ConstantPointerNull::get(PointerType::get(Type::getInt8Ty(Ctx), 1)); + ICmpInst *cmp = + dyn_cast<ICmpInst>(Builder.CreateICmpNE(pcall, zeroIntPtr, "")); + if (!CI->use_empty()) { + Value *result = + Builder.CreateSExt(Builder.CreateNot(cmp), I32Ty, "printf_res"); + CI->replaceAllUsesWith(result); + } + SplitBlock(CI->getParent(), cmp); + Instruction *Brnch = + SplitBlockAndInsertIfThen(cmp, cmp->getNextNode(), false); + + Builder.SetInsertPoint(Brnch); + + // store unique printf id in the buffer + // + SmallVector<Value *, 1> ZeroIdxList; + ConstantInt *zeroInt = + ConstantInt::get(Ctx, APInt(32, StringRef("0"), 10)); + ZeroIdxList.push_back(zeroInt); + + GetElementPtrInst *BufferIdx = + dyn_cast<GetElementPtrInst>(GetElementPtrInst::Create( + nullptr, pcall, ZeroIdxList, "PrintBuffID", Brnch)); + + Type *idPointer = PointerType::get(I32Ty, AMDGPUAS::GLOBAL_ADDRESS); + Value *id_gep_cast = + new BitCastInst(BufferIdx, idPointer, "PrintBuffIdCast", Brnch); + + StoreInst *stbuff = + new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast); + stbuff->insertBefore(Brnch); // to remove unused variable warning + + SmallVector<Value *, 2> FourthIdxList; + ConstantInt *fourInt = + ConstantInt::get(Ctx, APInt(32, StringRef("4"), 10)); + + FourthIdxList.push_back(fourInt); // 1st 4 bytes hold the printf_id + // the following GEP is the buffer pointer + BufferIdx = cast<GetElementPtrInst>(GetElementPtrInst::Create( + nullptr, pcall, FourthIdxList, "PrintBuffGep", Brnch)); + + Type *Int32Ty = Type::getInt32Ty(Ctx); + Type *Int64Ty = Type::getInt64Ty(Ctx); + for (unsigned ArgCount = 1; ArgCount < CI->getNumArgOperands() && + ArgCount <= OpConvSpecifiers.size(); + ArgCount++) { + Value *Arg = CI->getArgOperand(ArgCount); + Type *ArgType = Arg->getType(); + SmallVector<Value *, 32> WhatToStore; + if (ArgType->isFPOrFPVectorTy() && + (ArgType->getTypeID() != Type::VectorTyID)) { + Type *IType = (ArgType->isFloatTy()) ? Int32Ty : Int64Ty; + if (OpConvSpecifiers[ArgCount - 1] == 'f') { + ConstantFP *fpCons = dyn_cast<ConstantFP>(Arg); + if (fpCons) { + APFloat Val(fpCons->getValueAPF()); + bool Lost = false; + Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, + &Lost); + Arg = ConstantFP::get(Ctx, Val); + IType = Int32Ty; + } else { + FPExtInst *FpExt = dyn_cast<FPExtInst>(Arg); + if (FpExt && FpExt->getType()->isDoubleTy() && + FpExt->getOperand(0)->getType()->isFloatTy()) { + Arg = FpExt->getOperand(0); + IType = Int32Ty; + } + } + } + Arg = new BitCastInst(Arg, IType, "PrintArgFP", Brnch); + WhatToStore.push_back(Arg); + } else if (ArgType->getTypeID() == Type::PointerTyID) { + if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1], ArgType)) { + const char *S = NonLiteralStr; + if (ConstantExpr *ConstExpr = dyn_cast<ConstantExpr>(Arg)) { + GlobalVariable *GV = + dyn_cast<GlobalVariable>(ConstExpr->getOperand(0)); + if (GV && GV->hasInitializer()) { + Constant *Init = GV->getInitializer(); + ConstantDataArray *CA = dyn_cast<ConstantDataArray>(Init); + if (Init->isZeroValue() || CA->isString()) { + S = Init->isZeroValue() ?
"" : CA->getAsCString().data(); + } + } + size_t SizeStr = strlen(S) + 1; + size_t Rem = SizeStr % DWORD_ALIGN; + size_t NSizeStr = 0; + if (Rem) { + NSizeStr = SizeStr + (DWORD_ALIGN - Rem); + } else { + NSizeStr = SizeStr; + } + if (S[0]) { + char *MyNewStr = new char[NSizeStr](); + strcpy(MyNewStr, S); + int NumInts = NSizeStr / 4; + int CharC = 0; + while (NumInts) { + int ANum = *(int *)(MyNewStr + CharC); + CharC += 4; + NumInts--; + Value *ANumV = ConstantInt::get(Int32Ty, ANum, false); + WhatToStore.push_back(ANumV); + } + delete[] MyNewStr; + } else { + // Empty string, give a hint to RT it is not NULL + Value *ANumV = ConstantInt::get(Int32Ty, 0xFFFFFF00, false); + WhatToStore.push_back(ANumV); + } + } else { + uint64_t Size = TD->getTypeAllocSizeInBits(ArgType); + assert((Size == 32 || Size == 64) && "unsupported size"); + Type *DstType = (Size == 32) ? Int32Ty : Int64Ty; + Arg = new PtrToIntInst(Arg, DstType, "PrintArgPtr", Brnch); + WhatToStore.push_back(Arg); + } + } else if (ArgType->getTypeID() == Type::VectorTyID) { + Type *IType = NULL; + uint32_t EleCount = cast<VectorType>(ArgType)->getNumElements(); + uint32_t EleSize = ArgType->getScalarSizeInBits(); + uint32_t TotalSize = EleCount * EleSize; + if (EleCount == 3) { + IntegerType *Int32Ty = Type::getInt32Ty(ArgType->getContext()); + Constant *Indices[4] = { + ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, 1), + ConstantInt::get(Int32Ty, 2), ConstantInt::get(Int32Ty, 2)}; + Constant *Mask = ConstantVector::get(Indices); + ShuffleVectorInst *Shuffle = new ShuffleVectorInst(Arg, Arg, Mask); + Shuffle->insertBefore(Brnch); + Arg = Shuffle; + ArgType = Arg->getType(); + TotalSize += EleSize; + } + switch (EleSize) { + default: + EleCount = TotalSize / 64; + IType = dyn_cast<Type>(Type::getInt64Ty(ArgType->getContext())); + break; + case 8: + if (EleCount >= 8) { + EleCount = TotalSize / 64; + IType = dyn_cast<Type>(Type::getInt64Ty(ArgType->getContext())); + } else if (EleCount >= 3) { + EleCount = 1; + IType = dyn_cast<Type>(Type::getInt32Ty(ArgType->getContext())); + } else { + EleCount = 1; + IType = dyn_cast<Type>(Type::getInt16Ty(ArgType->getContext())); + } + break; + case 16: + if (EleCount >= 3) { + EleCount = TotalSize / 64; + IType = dyn_cast<Type>(Type::getInt64Ty(ArgType->getContext())); + } else { + EleCount = 1; + IType = dyn_cast<Type>(Type::getInt32Ty(ArgType->getContext())); + } + break; + } + if (EleCount > 1) { + IType = dyn_cast<Type>(VectorType::get(IType, EleCount)); + } + Arg = new BitCastInst(Arg, IType, "PrintArgVect", Brnch); + WhatToStore.push_back(Arg); + } else { + WhatToStore.push_back(Arg); + } + for (unsigned I = 0, E = WhatToStore.size(); I != E; ++I) { + Value *TheBtCast = WhatToStore[I]; + unsigned ArgSize = + TD->getTypeAllocSizeInBits(TheBtCast->getType()) / 8; + SmallVector<Value *, 1> BuffOffset; + BuffOffset.push_back(ConstantInt::get(I32Ty, ArgSize)); + + Type *ArgPointer = PointerType::get(TheBtCast->getType(), 1); + Value *CastedGEP = + new BitCastInst(BufferIdx, ArgPointer, "PrintBuffPtrCast", Brnch); + StoreInst *StBuff = new StoreInst(TheBtCast, CastedGEP, Brnch); + LLVM_DEBUG(dbgs() << "inserting store to printf buffer:\n" + << *StBuff << '\n'); + (void)StBuff; + if (I + 1 == E && ArgCount + 1 == CI->getNumArgOperands()) + break; + BufferIdx = dyn_cast<GetElementPtrInst>(GetElementPtrInst::Create( + nullptr, BufferIdx, BuffOffset, "PrintBuffNextPtr", Brnch)); + LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:\n" + << *BufferIdx << '\n'); + } + } + } + } + + 
// erase the printf calls + for (auto CI : Printfs) + CI->eraseFromParent(); + + Printfs.clear(); + return true; +} + +bool AMDGPUPrintfRuntimeBinding::runOnModule(Module &M) { + Triple TT(M.getTargetTriple()); + if (TT.getArch() == Triple::r600) + return false; + + auto PrintfFunction = M.getFunction("printf"); + if (!PrintfFunction) + return false; + + for (auto &U : PrintfFunction->uses()) { + if (auto *CI = dyn_cast<CallInst>(U.getUser())) { + if (CI->isCallee(&U)) + Printfs.push_back(CI); + } + } + + if (Printfs.empty()) + return false; + + TD = &M.getDataLayout(); + auto DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + }; + + return lowerPrintfForGpu(M, GetTLI); +} diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index e4c9d6685d4a..3e9dcca114a3 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -801,7 +801,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - GV->setAlignment(I.getAlignment()); + GV->setAlignment(MaybeAlign(I.getAlignment())); Value *TCntY, *TCntZ; diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 815cbc5e26ee..4d78188b3dc3 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -17,9 +17,9 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -33,6 +33,7 @@ #include "AMDGPUGenRegisterBankInfo.def" using namespace llvm; +using namespace MIPatternMatch; namespace { @@ -84,9 +85,11 @@ public: }; } -AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI) +AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST) : AMDGPUGenRegisterBankInfo(), - TRI(static_cast<const SIRegisterInfo*>(&TRI)) { + Subtarget(ST), + TRI(Subtarget.getRegisterInfo()), + TII(Subtarget.getInstrInfo()) { // HACK: Until this is fully tablegen'd. static bool AlreadyInit = false; @@ -163,11 +166,10 @@ unsigned AMDGPURegisterBankInfo::getBreakDownCost( const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass( const TargetRegisterClass &RC) const { + if (&RC == &AMDGPU::SReg_1RegClass) + return AMDGPU::VCCRegBank; - if (TRI->isSGPRClass(&RC)) - return getRegBank(AMDGPU::SGPRRegBankID); - - return getRegBank(AMDGPU::VGPRRegBankID); + return TRI->isSGPRClass(&RC) ? AMDGPU::SGPRRegBank : AMDGPU::VGPRRegBank; } template <unsigned NumOps> @@ -192,7 +194,8 @@ AMDGPURegisterBankInfo::addMappingFromTable( Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI); } - unsigned MappingID = 0; + // getInstrMapping's default mapping uses ID 1, so start at 2. 
+ unsigned MappingID = 2; for (const auto &Entry : Table) { for (unsigned I = 0; I < NumOps; ++I) { int OpIdx = RegSrcOpIdx[I]; @@ -210,7 +213,7 @@ AMDGPURegisterBankInfo::addMappingFromTable( RegisterBankInfo::InstructionMappings AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic( const MachineInstr &MI, const MachineRegisterInfo &MRI) const { - switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + switch (MI.getIntrinsicID()) { case Intrinsic::amdgcn_readlane: { static const OpRegBankEntry<3> Table[2] = { // Perfectly legal. @@ -251,7 +254,7 @@ RegisterBankInfo::InstructionMappings AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( const MachineInstr &MI, const MachineRegisterInfo &MRI) const { - switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + switch (MI.getIntrinsicID()) { case Intrinsic::amdgcn_buffer_load: { static const OpRegBankEntry<3> Table[4] = { // Perfectly legal. @@ -303,6 +306,7 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( } case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { + // FIXME: Should have no register for immediate static const OpRegBankEntry<1> Table[2] = { // Perfectly legal. { { AMDGPU::SGPRRegBankID }, 1 }, @@ -319,12 +323,15 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( } } -static bool isInstrUniform(const MachineInstr &MI) { +// FIXME: Returns uniform if there's no source value information. This is +// probably wrong. +static bool isInstrUniformNonExtLoadAlign4(const MachineInstr &MI) { if (!MI.hasOneMemOperand()) return false; const MachineMemOperand *MMO = *MI.memoperands_begin(); - return AMDGPUInstrInfo::isUniformMMO(MMO); + return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 && + AMDGPUInstrInfo::isUniformMMO(MMO); } RegisterBankInfo::InstructionMappings @@ -337,6 +344,31 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( InstructionMappings AltMappings; switch (MI.getOpcode()) { + case TargetOpcode::G_CONSTANT: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + if (Size == 1) { + static const OpRegBankEntry<1> Table[4] = { + { { AMDGPU::VGPRRegBankID }, 1 }, + { { AMDGPU::SGPRRegBankID }, 1 }, + { { AMDGPU::VCCRegBankID }, 1 }, + { { AMDGPU::SCCRegBankID }, 1 } + }; + + return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table); + } + + LLVM_FALLTHROUGH; + } + case TargetOpcode::G_FCONSTANT: + case TargetOpcode::G_FRAME_INDEX: + case TargetOpcode::G_GLOBAL_VALUE: { + static const OpRegBankEntry<1> Table[2] = { + { { AMDGPU::VGPRRegBankID }, 1 }, + { { AMDGPU::SGPRRegBankID }, 1 } + }; + + return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table); + } case TargetOpcode::G_AND: case TargetOpcode::G_OR: case TargetOpcode::G_XOR: { @@ -408,23 +440,29 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( AltMappings.push_back(&VSMapping); break; } - case TargetOpcode::G_LOAD: { + case TargetOpcode::G_LOAD: + case TargetOpcode::G_ZEXTLOAD: + case TargetOpcode::G_SEXTLOAD: { unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); + unsigned PtrSize = PtrTy.getSizeInBits(); + unsigned AS = PtrTy.getAddressSpace(); LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); - // FIXME: Should we be hard coding the size for these mappings? 
- if (isInstrUniform(MI)) { + if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS && + AS != AMDGPUAS::PRIVATE_ADDRESS) && + isInstrUniformNonExtLoadAlign4(MI)) { const InstructionMapping &SSMapping = getInstructionMapping( 1, 1, getOperandsMapping( {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), - AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}), 2); // Num Operands AltMappings.push_back(&SSMapping); } const InstructionMapping &VVMapping = getInstructionMapping( 2, 1, getOperandsMapping( - {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy), - AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}), + {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy), + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}), 2); // Num Operands AltMappings.push_back(&VVMapping); @@ -620,57 +658,53 @@ static LLT getHalfSizedType(LLT Ty) { /// /// There is additional complexity to try for compare values to identify the /// unique values used. -void AMDGPURegisterBankInfo::executeInWaterfallLoop( - MachineInstr &MI, MachineRegisterInfo &MRI, - ArrayRef<unsigned> OpIndices) const { - MachineFunction *MF = MI.getParent()->getParent(); - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); - const SIInstrInfo *TII = ST.getInstrInfo(); - MachineBasicBlock::iterator I(MI); - - MachineBasicBlock &MBB = *MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - - // Use a set to avoid extra readfirstlanes in the case where multiple operands - // are the same register. - SmallSet<Register, 4> SGPROperandRegs; - for (unsigned Op : OpIndices) { - assert(MI.getOperand(Op).isUse()); - Register Reg = MI.getOperand(Op).getReg(); - const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); - if (OpBank->getID() == AMDGPU::VGPRRegBankID) - SGPROperandRegs.insert(Reg); - } - - // No operands need to be replaced, so no need to loop. - if (SGPROperandRegs.empty()) - return; - - MachineIRBuilder B(MI); +bool AMDGPURegisterBankInfo::executeInWaterfallLoop( + MachineIRBuilder &B, + iterator_range<MachineBasicBlock::iterator> Range, + SmallSet<Register, 4> &SGPROperandRegs, + MachineRegisterInfo &MRI) const { SmallVector<Register, 4> ResultRegs; SmallVector<Register, 4> InitResultRegs; SmallVector<Register, 4> PhiRegs; - for (MachineOperand &Def : MI.defs()) { - LLT ResTy = MRI.getType(Def.getReg()); - const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI); - ResultRegs.push_back(Def.getReg()); - Register InitReg = B.buildUndef(ResTy).getReg(0); - Register PhiReg = MRI.createGenericVirtualRegister(ResTy); - InitResultRegs.push_back(InitReg); - PhiRegs.push_back(PhiReg); - MRI.setRegBank(PhiReg, *DefBank); - MRI.setRegBank(InitReg, *DefBank); + + MachineBasicBlock &MBB = B.getMBB(); + MachineFunction *MF = &B.getMF(); + + const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); + const unsigned WaveAndOpc = Subtarget.isWave32() ? + AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + const unsigned MovTermOpc = Subtarget.isWave32() ? + AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; + const unsigned XorTermOpc = Subtarget.isWave32() ? + AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; + const unsigned AndSaveExecOpc = Subtarget.isWave32() ? + AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; + const unsigned ExecReg = Subtarget.isWave32() ? 
+ AMDGPU::EXEC_LO : AMDGPU::EXEC; + + for (MachineInstr &MI : Range) { + for (MachineOperand &Def : MI.defs()) { + LLT ResTy = MRI.getType(Def.getReg()); + const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI); + ResultRegs.push_back(Def.getReg()); + Register InitReg = B.buildUndef(ResTy).getReg(0); + Register PhiReg = MRI.createGenericVirtualRegister(ResTy); + InitResultRegs.push_back(InitReg); + PhiRegs.push_back(PhiReg); + MRI.setRegBank(PhiReg, *DefBank); + MRI.setRegBank(InitReg, *DefBank); + } } - Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + Register SaveExecReg = MRI.createVirtualRegister(WaveRC); + Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC); // Don't bother using generic instructions/registers for the exec mask. B.buildInstr(TargetOpcode::IMPLICIT_DEF) .addDef(InitSaveExecReg); - Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + Register PhiExec = MRI.createVirtualRegister(WaveRC); + Register NewExec = MRI.createVirtualRegister(WaveRC); // To insert the loop we need to split the block. Move everything before this // point to a new block, and insert a new empty block before this instruction. @@ -688,7 +722,7 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop( // Move the rest of the block into a new block. RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); - RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); + RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end()); MBB.addSuccessor(LoopBB); RestoreExecBB->addSuccessor(RemainderBB); @@ -711,164 +745,173 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop( .addMBB(LoopBB); } - // Move the instruction into the loop. - LoopBB->splice(LoopBB->end(), &MBB, I); - I = std::prev(LoopBB->end()); + const DebugLoc &DL = B.getDL(); + + // Figure out the iterator range after splicing the instructions. + auto NewBegin = std::prev(LoopBB->end()); - B.setInstr(*I); + // Move the instruction into the loop. Note we moved everything after + // Range.end() already into a new block, so Range.end() is no longer valid. + LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end()); + + auto NewEnd = LoopBB->end(); + + MachineBasicBlock::iterator I = Range.begin(); + B.setInsertPt(*LoopBB, I); Register CondReg; - for (MachineOperand &Op : MI.uses()) { - if (!Op.isReg()) - continue; + for (MachineInstr &MI : make_range(NewBegin, NewEnd)) { + for (MachineOperand &Op : MI.uses()) { + if (!Op.isReg() || Op.isDef()) + continue; - assert(!Op.isDef()); - if (SGPROperandRegs.count(Op.getReg())) { - LLT OpTy = MRI.getType(Op.getReg()); - unsigned OpSize = OpTy.getSizeInBits(); + if (SGPROperandRegs.count(Op.getReg())) { + LLT OpTy = MRI.getType(Op.getReg()); + unsigned OpSize = OpTy.getSizeInBits(); - // Can only do a readlane of 32-bit pieces. - if (OpSize == 32) { - // Avoid extra copies in the simple case of one 32-bit register. - Register CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - MRI.setType(CurrentLaneOpReg, OpTy); + // Can only do a readlane of 32-bit pieces. + if (OpSize == 32) { + // Avoid extra copies in the simple case of one 32-bit register. 
+ Register CurrentLaneOpReg + = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + MRI.setType(CurrentLaneOpReg, OpTy); - constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI); - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg) - .addReg(Op.getReg()); + constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI); + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpReg) + .addReg(Op.getReg()); - Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - bool First = CondReg == AMDGPU::NoRegister; - if (First) - CondReg = NewCondReg; + Register NewCondReg = MRI.createVirtualRegister(WaveRC); + bool First = CondReg == AMDGPU::NoRegister; + if (First) + CondReg = NewCondReg; - // Compare the just read M0 value to all possible Idx values. - B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) - .addDef(NewCondReg) - .addReg(CurrentLaneOpReg) - .addReg(Op.getReg()); - Op.setReg(CurrentLaneOpReg); + // Compare the just read M0 value to all possible Idx values. + B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) + .addDef(NewCondReg) + .addReg(CurrentLaneOpReg) + .addReg(Op.getReg()); + Op.setReg(CurrentLaneOpReg); - if (!First) { - Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + if (!First) { + Register AndReg = MRI.createVirtualRegister(WaveRC); - // If there are multiple operands to consider, and the conditions. - B.buildInstr(AMDGPU::S_AND_B64) - .addDef(AndReg) - .addReg(NewCondReg) - .addReg(CondReg); - CondReg = AndReg; - } - } else { - LLT S32 = LLT::scalar(32); - SmallVector<Register, 8> ReadlanePieces; + // If there are multiple operands to consider, and the conditions. + B.buildInstr(WaveAndOpc) + .addDef(AndReg) + .addReg(NewCondReg) + .addReg(CondReg); + CondReg = AndReg; + } + } else { + LLT S32 = LLT::scalar(32); + SmallVector<Register, 8> ReadlanePieces; - // The compares can be done as 64-bit, but the extract needs to be done - // in 32-bit pieces. + // The compares can be done as 64-bit, but the extract needs to be done + // in 32-bit pieces. - bool Is64 = OpSize % 64 == 0; + bool Is64 = OpSize % 64 == 0; - LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32); - unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64 - : AMDGPU::V_CMP_EQ_U32_e64; + LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32); + unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64 + : AMDGPU::V_CMP_EQ_U32_e64; - // The compares can be done as 64-bit, but the extract needs to be done - // in 32-bit pieces. + // The compares can be done as 64-bit, but the extract needs to be done + // in 32-bit pieces. - // Insert the unmerge before the loop. + // Insert the unmerge before the loop. 
- B.setMBB(MBB); - auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg()); - B.setInstr(*I); + B.setMBB(MBB); + auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg()); + B.setInstr(*I); - unsigned NumPieces = Unmerge->getNumOperands() - 1; - for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) { - unsigned UnmergePiece = Unmerge.getReg(PieceIdx); + unsigned NumPieces = Unmerge->getNumOperands() - 1; + for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) { + Register UnmergePiece = Unmerge.getReg(PieceIdx); - Register CurrentLaneOpReg; - if (Is64) { - Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); - Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); + Register CurrentLaneOpReg; + if (Is64) { + Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); + Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); - MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); - MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); - MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); + MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); + MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); + MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpRegLo) - .addReg(UnmergePiece, 0, AMDGPU::sub0); + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpRegLo) + .addReg(UnmergePiece, 0, AMDGPU::sub0); - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpRegHi) - .addReg(UnmergePiece, 0, AMDGPU::sub1); + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpRegHi) + .addReg(UnmergePiece, 0, AMDGPU::sub1); - CurrentLaneOpReg = + CurrentLaneOpReg = B.buildMerge(LLT::scalar(64), {CurrentLaneOpRegLo, CurrentLaneOpRegHi}) - .getReg(0); + .getReg(0); - MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); + MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); - if (OpTy.getScalarSizeInBits() == 64) { - // If we need to produce a 64-bit element vector, so use the - // merged pieces - ReadlanePieces.push_back(CurrentLaneOpReg); + if (OpTy.getScalarSizeInBits() == 64) { + // If we need to produce a 64-bit element vector, so use the + // merged pieces + ReadlanePieces.push_back(CurrentLaneOpReg); + } else { + // 32-bit element type. + ReadlanePieces.push_back(CurrentLaneOpRegLo); + ReadlanePieces.push_back(CurrentLaneOpRegHi); + } } else { - // 32-bit element type. - ReadlanePieces.push_back(CurrentLaneOpRegLo); - ReadlanePieces.push_back(CurrentLaneOpRegHi); + CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32); + MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); + MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); + + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpReg) + .addReg(UnmergePiece); + ReadlanePieces.push_back(CurrentLaneOpReg); } - } else { - CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); - MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); - MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); - // Read the next variant <- also loop target. 
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpReg) - .addReg(UnmergePiece); - ReadlanePieces.push_back(CurrentLaneOpReg); - } + Register NewCondReg = MRI.createVirtualRegister(WaveRC); + bool First = CondReg == AMDGPU::NoRegister; + if (First) + CondReg = NewCondReg; - Register NewCondReg - = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - bool First = CondReg == AMDGPU::NoRegister; - if (First) - CondReg = NewCondReg; + B.buildInstr(CmpOp) + .addDef(NewCondReg) + .addReg(CurrentLaneOpReg) + .addReg(UnmergePiece); - B.buildInstr(CmpOp) - .addDef(NewCondReg) - .addReg(CurrentLaneOpReg) - .addReg(UnmergePiece); + if (!First) { + Register AndReg = MRI.createVirtualRegister(WaveRC); - if (!First) { - Register AndReg - = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + // If there are multiple operands to consider, and the conditions. + B.buildInstr(WaveAndOpc) + .addDef(AndReg) + .addReg(NewCondReg) + .addReg(CondReg); + CondReg = AndReg; + } + } - // If there are multiple operands to consider, and the conditions. - B.buildInstr(AMDGPU::S_AND_B64) - .addDef(AndReg) - .addReg(NewCondReg) - .addReg(CondReg); - CondReg = AndReg; + // FIXME: Build merge seems to switch to CONCAT_VECTORS but not + // BUILD_VECTOR + if (OpTy.isVector()) { + auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); + Op.setReg(Merge.getReg(0)); + } else { + auto Merge = B.buildMerge(OpTy, ReadlanePieces); + Op.setReg(Merge.getReg(0)); } - } - // FIXME: Build merge seems to switch to CONCAT_VECTORS but not - // BUILD_VECTOR - if (OpTy.isVector()) { - auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); - Op.setReg(Merge.getReg(0)); - } else { - auto Merge = B.buildMerge(OpTy, ReadlanePieces); - Op.setReg(Merge.getReg(0)); + MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID)); } - - MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID)); } } } @@ -876,16 +919,16 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop( B.setInsertPt(*LoopBB, LoopBB->end()); // Update EXEC, save the original EXEC value to VCC. - B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64) + B.buildInstr(AndSaveExecOpc) .addDef(NewExec) .addReg(CondReg, RegState::Kill); MRI.setSimpleHint(NewExec, CondReg); // Update EXEC, switch all done bits to 0 and all todo bits to 1. - B.buildInstr(AMDGPU::S_XOR_B64_term) - .addDef(AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) + B.buildInstr(XorTermOpc) + .addDef(ExecReg) + .addReg(ExecReg) .addReg(NewExec); // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use @@ -896,14 +939,60 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop( .addMBB(LoopBB); // Save the EXEC mask before the loop. - BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg) - .addReg(AMDGPU::EXEC); + BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg) + .addReg(ExecReg); // Restore the EXEC mask after the loop. B.setMBB(*RestoreExecBB); - B.buildInstr(AMDGPU::S_MOV_B64_term) - .addDef(AMDGPU::EXEC) + B.buildInstr(MovTermOpc) + .addDef(ExecReg) .addReg(SaveExecReg); + + // Restore the insert point before the original instruction. + B.setInsertPt(MBB, MBB.end()); + + return true; +} + +// Return any unique registers used by \p MI at \p OpIndices that need to be +// handled in a waterfall loop. Returns these registers in \p +// SGPROperandRegs. Returns true if there are any operands to handle and a +// waterfall loop is necessary.
+bool AMDGPURegisterBankInfo::collectWaterfallOperands( + SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI, + MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const { + for (unsigned Op : OpIndices) { + assert(MI.getOperand(Op).isUse()); + Register Reg = MI.getOperand(Op).getReg(); + const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); + if (OpBank->getID() == AMDGPU::VGPRRegBankID) + SGPROperandRegs.insert(Reg); + } + + // No operands need to be replaced, so no need to loop. + return !SGPROperandRegs.empty(); +} + +bool AMDGPURegisterBankInfo::executeInWaterfallLoop( + MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI, + ArrayRef<unsigned> OpIndices) const { + // Use a set to avoid extra readfirstlanes in the case where multiple operands + // are the same register. + SmallSet<Register, 4> SGPROperandRegs; + + if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices)) + return false; + + MachineBasicBlock::iterator I = MI.getIterator(); + return executeInWaterfallLoop(B, make_range(I, std::next(I)), + SGPROperandRegs, MRI); +} + +bool AMDGPURegisterBankInfo::executeInWaterfallLoop( + MachineInstr &MI, MachineRegisterInfo &MRI, + ArrayRef<unsigned> OpIndices) const { + MachineIRBuilder B(MI); + return executeInWaterfallLoop(B, MI, MRI, OpIndices); } // Legalize an operand that must be an SGPR by inserting a readfirstlane. @@ -960,8 +1049,13 @@ bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI, SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1)); // If the pointer is an SGPR, we have nothing to do. - if (SrcRegs.empty()) - return false; + if (SrcRegs.empty()) { + Register PtrReg = MI.getOperand(1).getReg(); + const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); + if (PtrBank == &AMDGPU::SGPRRegBank) + return false; + SrcRegs.push_back(PtrReg); + } assert(LoadSize % MaxNonSmrdLoadSize == 0); @@ -1013,6 +1107,33 @@ bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI, return true; } +bool AMDGPURegisterBankInfo::applyMappingImage( + MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, + MachineRegisterInfo &MRI, int RsrcIdx) const { + const int NumDefs = MI.getNumExplicitDefs(); + + // The reported argument index is relative to the IR intrinsic call arguments, + // so we need to shift by the number of defs and the intrinsic ID. + RsrcIdx += NumDefs + 1; + + // Insert copies to VGPR arguments. + applyDefaultMapping(OpdMapper); + + // Fixup any SGPR arguments. + SmallVector<unsigned, 4> SGPRIndexes; + for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) { + if (!MI.getOperand(I).isReg()) + continue; + + // If this intrinsic has a sampler, it immediately follows rsrc. + if (I == RsrcIdx || I == RsrcIdx + 1) + SGPRIndexes.push_back(I); + } + + executeInWaterfallLoop(MI, MRI, SGPRIndexes); + return true; +} + // For cases where only a single copy is inserted for matching register banks. // Replace the register in the instruction operand static void substituteSimpleCopyRegs( @@ -1024,6 +1145,184 @@ static void substituteSimpleCopyRegs( } } +/// Handle register layout difference for f16 images for some subtargets. 
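+/// With unpacked D16 VMEM, each 16-bit data element occupies the low half of
+/// a full 32-bit register, so an s16 vector value must be widened first: the
+/// value is unmerged into its s16 pieces and re-merged as a vector of s32
+/// elements (e.g. a <2 x s16> store value becomes a <2 x s32>, one 16-bit
+/// piece per 32-bit lane), as the function body below shows.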
+Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, + MachineRegisterInfo &MRI, + Register Reg) const { + if (!Subtarget.hasUnpackedD16VMem()) + return Reg; + + const LLT S16 = LLT::scalar(16); + LLT StoreVT = MRI.getType(Reg); + if (!StoreVT.isVector() || StoreVT.getElementType() != S16) + return Reg; + + auto Unmerge = B.buildUnmerge(S16, Reg); + + SmallVector<Register, 4> WideRegs; + for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) + WideRegs.push_back(Unmerge.getReg(I)); + + const LLT S32 = LLT::scalar(32); + int NumElts = StoreVT.getNumElements(); + + return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0); +} + +static std::pair<Register, unsigned> +getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { + int64_t Const; + if (mi_match(Reg, MRI, m_ICst(Const))) + return std::make_pair(Register(), Const); + + Register Base; + if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const)))) + return std::make_pair(Base, Const); + + // TODO: Handle G_OR used for add case + return std::make_pair(Reg, 0); +} + +std::pair<Register, unsigned> +AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, + Register OrigOffset) const { + const unsigned MaxImm = 4095; + Register BaseReg; + unsigned ImmOffset; + const LLT S32 = LLT::scalar(32); + + std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(), + OrigOffset); + + unsigned C1 = 0; + if (ImmOffset != 0) { + // If the immediate value is too big for the immoffset field, keep only the + // low 12 bits (value & 4095) in the immoffset field, so that the remainder + // that is copied/added for the voffset field is a multiple of 4096 and + // stands more chance of being CSEd with the copy/add for another similar + // load/store. + // However, do not do that rounding down to a multiple of 4096 if that is a + // negative number, as it appears to be illegal to have a negative offset + // in the vgpr, even if adding the immediate offset makes it positive. + unsigned Overflow = ImmOffset & ~MaxImm; + ImmOffset -= Overflow; + if ((int32_t)Overflow < 0) { + Overflow += ImmOffset; + ImmOffset = 0; + } + + C1 = ImmOffset; + if (Overflow != 0) { + if (!BaseReg) + BaseReg = B.buildConstant(S32, Overflow).getReg(0); + else { + auto OverflowVal = B.buildConstant(S32, Overflow); + BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); + } + } + } + + if (!BaseReg) + BaseReg = B.buildConstant(S32, 0).getReg(0); + + return {BaseReg, C1}; +} + +static bool isZero(Register Reg, MachineRegisterInfo &MRI) { + int64_t C; + return mi_match(Reg, MRI, m_ICst(C)) && C == 0; +} + +static unsigned extractGLC(unsigned CachePolicy) { + return CachePolicy & 1; +} + +static unsigned extractSLC(unsigned CachePolicy) { + return (CachePolicy >> 1) & 1; +} + +static unsigned extractDLC(unsigned CachePolicy) { + return (CachePolicy >> 2) & 1; +} + +MachineInstr * +AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B, + MachineInstr &MI) const { + MachineRegisterInfo &MRI = *B.getMRI(); + executeInWaterfallLoop(B, MI, MRI, {2, 4}); + + // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer. + + Register VData = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(VData); + + int EltSize = Ty.getScalarSizeInBits(); + int Size = Ty.getSizeInBits(); + + // FIXME: Broken integer truncstore. + if (EltSize != 32) + report_fatal_error("unhandled intrinsic store"); + + // FIXME: Verifier should enforce 1 MMO for these intrinsics.
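+  // A note on the offset split used a few lines below: splitBufferOffsets
+  // keeps only the low 12 bits of a constant offset in the returned
+  // immediate. For illustration (assumed value): a constant offset of 8200
+  // does not fit in the 4095-max immoffset field, so it splits into a
+  // voffset constant of 8192 (a multiple of 4096) and an immediate of 8,
+  // i.e. {buildConstant(S32, 8192), 8}.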
+ const int MemSize = (*MI.memoperands_begin())->getSize(); + + + Register RSrc = MI.getOperand(2).getReg(); + Register VOffset = MI.getOperand(3).getReg(); + Register SOffset = MI.getOperand(4).getReg(); + unsigned CachePolicy = MI.getOperand(5).getImm(); + + unsigned ImmOffset; + std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); + + const bool Offen = !isZero(VOffset, MRI); + + unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact; + switch (8 * MemSize) { + case 8: + Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact : + AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact; + break; + case 16: + Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact : + AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact; + break; + default: + Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact : + AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact; + if (Size > 32) + Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32); + break; + } + + + // Set the insertion point back to the instruction in case it was moved into a + // loop. + B.setInstr(MI); + + MachineInstrBuilder MIB = B.buildInstr(Opc) + .addUse(VData); + + if (Offen) + MIB.addUse(VOffset); + + MIB.addUse(RSrc) + .addUse(SOffset) + .addImm(ImmOffset) + .addImm(extractGLC(CachePolicy)) + .addImm(extractSLC(CachePolicy)) + .addImm(0) // tfe: FIXME: Remove from inst + .addImm(extractDLC(CachePolicy)) + .cloneMemRefs(MI); + + // FIXME: We need a way to report failure from applyMappingImpl. + // Insert constrain copies before inserting the loop. + if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) + report_fatal_error("failed to constrain selected store intrinsic"); + + return MIB; +} + void AMDGPURegisterBankInfo::applyMappingImpl( const OperandsMapper &OpdMapper) const { MachineInstr &MI = OpdMapper.getMI(); @@ -1289,12 +1588,202 @@ void AMDGPURegisterBankInfo::applyMappingImpl( MI.eraseFromParent(); return; } - case AMDGPU::G_EXTRACT_VECTOR_ELT: - applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(MI, MRI, { 2 }); + case AMDGPU::G_BUILD_VECTOR: + case AMDGPU::G_BUILD_VECTOR_TRUNC: { + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + if (DstTy != LLT::vector(2, 16)) + break; + + assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty()); + substituteSimpleCopyRegs(OpdMapper, 1); + substituteSimpleCopyRegs(OpdMapper, 2); + + const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); + if (DstBank == &AMDGPU::SGPRRegBank) + break; // Can use S_PACK_* instructions. 
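+  // VGPR case: pack the two 16-bit sources manually, then bitcast the packed
+  // 32-bit value back to <2 x s16>:
+  //   G_BUILD_VECTOR x, y       ==> bitcast(zext(x) | (zext(y) << 16))
+  //   G_BUILD_VECTOR_TRUNC x, y ==> bitcast((x & 0xffff) | (y << 16))
+  // For illustration (assumed values): x = 0x1234, y = 0xabcd packs to
+  // 0xabcd1234.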
+ + MachineIRBuilder B(MI); + + Register Lo = MI.getOperand(1).getReg(); + Register Hi = MI.getOperand(2).getReg(); + const LLT S32 = LLT::scalar(32); + + const RegisterBank *BankLo = getRegBank(Lo, MRI, *TRI); + const RegisterBank *BankHi = getRegBank(Hi, MRI, *TRI); + + Register ZextLo; + Register ShiftHi; + + if (Opc == AMDGPU::G_BUILD_VECTOR) { + ZextLo = B.buildZExt(S32, Lo).getReg(0); + MRI.setRegBank(ZextLo, *BankLo); + + Register ZextHi = B.buildZExt(S32, Hi).getReg(0); + MRI.setRegBank(ZextHi, *BankHi); + + auto ShiftAmt = B.buildConstant(S32, 16); + MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); + + ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0); + MRI.setRegBank(ShiftHi, *BankHi); + } else { + Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0); + MRI.setRegBank(MaskLo, *BankLo); + + auto ShiftAmt = B.buildConstant(S32, 16); + MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); + + ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0); + MRI.setRegBank(ShiftHi, *BankHi); + + ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0); + MRI.setRegBank(ZextLo, *BankLo); + } + + auto Or = B.buildOr(S32, ZextLo, ShiftHi); + MRI.setRegBank(Or.getReg(0), *DstBank); + + B.buildBitcast(DstReg, Or); + MI.eraseFromParent(); return; + } + case AMDGPU::G_EXTRACT_VECTOR_ELT: { + SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); + + assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); + + if (DstRegs.empty()) { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, { 2 }); + return; + } + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register IdxReg = MI.getOperand(2).getReg(); + LLT DstTy = MRI.getType(DstReg); + (void)DstTy; + + assert(DstTy.getSizeInBits() == 64); + + LLT SrcTy = MRI.getType(SrcReg); + const LLT S32 = LLT::scalar(32); + LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); + + MachineIRBuilder B(MI); + auto CastSrc = B.buildBitcast(Vec32, SrcReg); + auto One = B.buildConstant(S32, 1); + + // Split the vector index into 32-bit pieces. Prepare to move all of the + // new instructions into a waterfall loop if necessary. + // + // Don't put the bitcast or constant in the loop. + MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); + + // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). + auto IdxLo = B.buildShl(S32, IdxReg, One); + auto IdxHi = B.buildAdd(S32, IdxLo, One); + B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); + B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); + + const ValueMapping &DstMapping + = OpdMapper.getInstrMapping().getOperandMapping(0); + + // FIXME: Should be getting from mapping or not? + const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); + MRI.setRegBank(DstReg, *DstMapping.BreakDown[0].RegBank); + MRI.setRegBank(CastSrc.getReg(0), *SrcBank); + MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); + + SmallSet<Register, 4> OpsToWaterfall; + if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { + MI.eraseFromParent(); + return; + } + + // Remove the original instruction to avoid potentially confusing the + // waterfall loop logic. 
+ B.setInstr(*Span.begin()); + MI.eraseFromParent(); + executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), + OpsToWaterfall, MRI); + return; + } + case AMDGPU::G_INSERT_VECTOR_ELT: { + SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); + + assert(OpdMapper.getVRegs(0).empty()); + assert(OpdMapper.getVRegs(1).empty()); + assert(OpdMapper.getVRegs(3).empty()); + + if (InsRegs.empty()) { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, { 3 }); + return; + } + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register InsReg = MI.getOperand(2).getReg(); + Register IdxReg = MI.getOperand(3).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + LLT InsTy = MRI.getType(InsReg); + (void)InsTy; + + assert(InsTy.getSizeInBits() == 64); + + const LLT S32 = LLT::scalar(32); + LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); + + MachineIRBuilder B(MI); + auto CastSrc = B.buildBitcast(Vec32, SrcReg); + auto One = B.buildConstant(S32, 1); + + // Split the vector index into 32-bit pieces. Prepare to move all of the + // new instructions into a waterfall loop if necessary. + // + // Don't put the bitcast or constant in the loop. + MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); + + // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). + auto IdxLo = B.buildShl(S32, IdxReg, One); + auto IdxHi = B.buildAdd(S32, IdxLo, One); + + auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); + auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); + B.buildBitcast(DstReg, InsHi); + + const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); + const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); + const RegisterBank *InsSrcBank = getRegBank(InsReg, MRI, *TRI); + + MRI.setRegBank(InsReg, *InsSrcBank); + MRI.setRegBank(CastSrc.getReg(0), *SrcBank); + MRI.setRegBank(InsLo.getReg(0), *DstBank); + MRI.setRegBank(InsHi.getReg(0), *DstBank); + MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); + + + SmallSet<Register, 4> OpsToWaterfall; + if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { + MI.eraseFromParent(); + return; + } + + B.setInstr(*Span.begin()); + MI.eraseFromParent(); + + executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), + OpsToWaterfall, MRI); + return; + } case AMDGPU::G_INTRINSIC: { - switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + switch (MI.getIntrinsicID()) { case Intrinsic::amdgcn_s_buffer_load: { // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS executeInWaterfallLoop(MI, MRI, { 2, 3 }); @@ -1303,8 +1792,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case Intrinsic::amdgcn_readlane: { substituteSimpleCopyRegs(OpdMapper, 2); - assert(empty(OpdMapper.getVRegs(0))); - assert(empty(OpdMapper.getVRegs(3))); + assert(OpdMapper.getVRegs(0).empty()); + assert(OpdMapper.getVRegs(3).empty()); // Make sure the index is an SGPR. It doesn't make sense to run this in a // waterfall loop, so assume it's a uniform value. 
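The G_EXTRACT_VECTOR_ELT and G_INSERT_VECTOR_ELT changes above lower an access to a 64-bit element by bitcasting the source to a twice-as-long vector of 32-bit elements and doubling the index. A minimal standalone sketch of the same index arithmetic, assuming the vector is laid out as an array of 32-bit words with the low half first (illustration only, not the in-tree code):

#include <cstdint>

// Extract element OrigIdx from a vector of 64-bit values stored as 32-bit
// words. IdxLo/IdxHi mirror the IdxLo/IdxHi registers built above.
static uint64_t extractElt64(const uint32_t *Words, unsigned OrigIdx) {
  unsigned IdxLo = 2 * OrigIdx; // B.buildShl(S32, IdxReg, One)
  unsigned IdxHi = IdxLo + 1;   // B.buildAdd(S32, IdxLo, One)
  return uint64_t(Words[IdxLo]) | (uint64_t(Words[IdxHi]) << 32);
}

Only the two index computations and the element accesses belong inside the waterfall loop; the bitcast and the constant 1 are built before the MachineInstrSpan precisely so they stay out of it.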
@@ -1312,9 +1801,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } case Intrinsic::amdgcn_writelane: { - assert(empty(OpdMapper.getVRegs(0))); - assert(empty(OpdMapper.getVRegs(2))); - assert(empty(OpdMapper.getVRegs(3))); + assert(OpdMapper.getVRegs(0).empty()); + assert(OpdMapper.getVRegs(2).empty()); + assert(OpdMapper.getVRegs(3).empty()); substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val constrainOpWithReadfirstlane(MI, MRI, 2); // Source value @@ -1327,7 +1816,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( break; } case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { - switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + auto IntrID = MI.getIntrinsicID(); + switch (IntrID) { case Intrinsic::amdgcn_buffer_load: { executeInWaterfallLoop(MI, MRI, { 2 }); return; @@ -1335,23 +1825,70 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: { // This is only allowed to execute with 1 lane, so readfirstlane is safe. - assert(empty(OpdMapper.getVRegs(0))); + assert(OpdMapper.getVRegs(0).empty()); substituteSimpleCopyRegs(OpdMapper, 3); constrainOpWithReadfirstlane(MI, MRI, 2); // M0 return; } + case Intrinsic::amdgcn_ds_gws_init: + case Intrinsic::amdgcn_ds_gws_barrier: + case Intrinsic::amdgcn_ds_gws_sema_br: { + // Only the first lane executes, so readfirstlane is safe. + substituteSimpleCopyRegs(OpdMapper, 1); + constrainOpWithReadfirstlane(MI, MRI, 2); // M0 + return; + } + case Intrinsic::amdgcn_ds_gws_sema_v: + case Intrinsic::amdgcn_ds_gws_sema_p: + case Intrinsic::amdgcn_ds_gws_sema_release_all: { + // Only the first lane executes, so readfirstlane is safe. + constrainOpWithReadfirstlane(MI, MRI, 1); // M0 + return; + } case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { // FIXME: Should this use a waterfall loop? constrainOpWithReadfirstlane(MI, MRI, 2); // M0 return; } - default: + case Intrinsic::amdgcn_raw_buffer_load: + case Intrinsic::amdgcn_raw_buffer_load_format: + case Intrinsic::amdgcn_raw_tbuffer_load: + case Intrinsic::amdgcn_raw_buffer_store: + case Intrinsic::amdgcn_raw_buffer_store_format: + case Intrinsic::amdgcn_raw_tbuffer_store: { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, {2, 4}); + return; + } + case Intrinsic::amdgcn_struct_buffer_load: + case Intrinsic::amdgcn_struct_buffer_store: + case Intrinsic::amdgcn_struct_tbuffer_load: + case Intrinsic::amdgcn_struct_tbuffer_store: { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, {2, 5}); + return; + } + default: { + if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = + AMDGPU::lookupRsrcIntrinsic(IntrID)) { + // Non-images can have complications from operands that allow both SGPR + // and VGPR. For now it's too complicated to figure out the final opcode + // to derive the register bank from the MCInstrDesc.
+ if (RSrcIntrin->IsImage) { + applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg); + return; + } + } + break; } + } break; } - case AMDGPU::G_LOAD: { + case AMDGPU::G_LOAD: + case AMDGPU::G_ZEXTLOAD: + case AMDGPU::G_SEXTLOAD: { if (applyMappingWideLoad(MI, OpdMapper, MRI)) return; break; } @@ -1452,25 +1989,71 @@ AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { } const RegisterBankInfo::InstructionMapping & +AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, + const MachineInstr &MI, + int RsrcIdx) const { + // The reported argument index is relative to the IR intrinsic call arguments, + // so we need to shift by the number of defs and the intrinsic ID. + RsrcIdx += MI.getNumExplicitDefs() + 1; + + const int NumOps = MI.getNumOperands(); + SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps); + + // TODO: Should packed/unpacked D16 difference be reported here as part of + // the value mapping? + for (int I = 0; I != NumOps; ++I) { + if (!MI.getOperand(I).isReg()) + continue; + + Register OpReg = MI.getOperand(I).getReg(); + unsigned Size = getSizeInBits(OpReg, MRI, *TRI); + + // FIXME: Probably need a new intrinsic register bank searchable table to + // handle arbitrary intrinsics easily. + // + // If this has a sampler, it immediately follows rsrc. + const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1; + + if (MustBeSGPR) { + // If this must be an SGPR, we must report whatever it is as legal. + unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID); + OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size); + } else { + // Some operands must be VGPR, and these are easy to copy to. + OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + } + } + + return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps); +} + +const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); - SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); + SmallVector<const ValueMapping*, 2> OpdsMapping(2); unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); - unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); + Register PtrReg = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(PtrReg); + unsigned AS = PtrTy.getAddressSpace(); + unsigned PtrSize = PtrTy.getSizeInBits(); const ValueMapping *ValMapping; const ValueMapping *PtrMapping; - if (isInstrUniform(MI)) { + const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); + + if (PtrBank == &AMDGPU::SGPRRegBank && + (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS && + AS != AMDGPUAS::PRIVATE_ADDRESS) && + isInstrUniformNonExtLoadAlign4(MI)) { // We have a uniform instruction so we want to use an SMRD load ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); } else { ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy); - // FIXME: What would happen if we used SGPRRegBankID here? PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); } @@ -1494,6 +2077,31 @@ AMDGPURegisterBankInfo::getRegBankID(Register Reg, return Bank ? 
Bank->getID() : Default; } + +static unsigned regBankUnion(unsigned RB0, unsigned RB1) { + return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ? + AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; +} + +const RegisterBankInfo::ValueMapping * +AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const { + // Lie and claim anything is legal, even though this needs to be an SGPR + // applyMapping will have to deal with it as a waterfall loop. + unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID); + unsigned Size = getSizeInBits(Reg, MRI, TRI); + return AMDGPU::getValueMapping(Bank, Size); +} + +const RegisterBankInfo::ValueMapping * +AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const { + unsigned Size = getSizeInBits(Reg, MRI, TRI); + return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); +} + /// /// This function must return a legal mapping, because /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called @@ -1536,7 +2144,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { int ResultBank = -1; for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { - unsigned Reg = MI.getOperand(I).getReg(); + Register Reg = MI.getOperand(I).getReg(); const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); // FIXME: Assuming VGPR for any undetermined inputs. @@ -1660,7 +2268,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { LLVM_FALLTHROUGH; } - case AMDGPU::G_GEP: case AMDGPU::G_ADD: case AMDGPU::G_SUB: @@ -1669,15 +2276,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_LSHR: case AMDGPU::G_ASHR: case AMDGPU::G_UADDO: - case AMDGPU::G_SADDO: case AMDGPU::G_USUBO: - case AMDGPU::G_SSUBO: case AMDGPU::G_UADDE: case AMDGPU::G_SADDE: case AMDGPU::G_USUBE: case AMDGPU::G_SSUBE: - case AMDGPU::G_UMULH: - case AMDGPU::G_SMULH: case AMDGPU::G_SMIN: case AMDGPU::G_SMAX: case AMDGPU::G_UMIN: @@ -1692,17 +2295,32 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_FPTOUI: case AMDGPU::G_FMUL: case AMDGPU::G_FMA: + case AMDGPU::G_FMAD: case AMDGPU::G_FSQRT: + case AMDGPU::G_FFLOOR: + case AMDGPU::G_FCEIL: + case AMDGPU::G_FRINT: case AMDGPU::G_SITOFP: case AMDGPU::G_UITOFP: case AMDGPU::G_FPTRUNC: case AMDGPU::G_FPEXT: case AMDGPU::G_FEXP2: case AMDGPU::G_FLOG2: + case AMDGPU::G_FMINNUM: + case AMDGPU::G_FMAXNUM: + case AMDGPU::G_FMINNUM_IEEE: + case AMDGPU::G_FMAXNUM_IEEE: case AMDGPU::G_FCANONICALIZE: case AMDGPU::G_INTRINSIC_TRUNC: case AMDGPU::G_INTRINSIC_ROUND: + case AMDGPU::G_AMDGPU_FFBH_U32: + return getDefaultMappingVOP(MI); + case AMDGPU::G_UMULH: + case AMDGPU::G_SMULH: { + if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) + return getDefaultMappingSOP(MI); return getDefaultMappingVOP(MI); + } case AMDGPU::G_IMPLICIT_DEF: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); @@ -1710,12 +2328,19 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case AMDGPU::G_FCONSTANT: case AMDGPU::G_CONSTANT: - case AMDGPU::G_FRAME_INDEX: + case AMDGPU::G_GLOBAL_VALUE: case AMDGPU::G_BLOCK_ADDR: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } + case 
AMDGPU::G_FRAME_INDEX: { + // TODO: This should be the same as other constants, but eliminateFrameIndex + // currently assumes VALU uses. + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + break; + } case AMDGPU::G_INSERT: { unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; @@ -1737,8 +2362,25 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = nullptr; break; } - case AMDGPU::G_MERGE_VALUES: case AMDGPU::G_BUILD_VECTOR: + case AMDGPU::G_BUILD_VECTOR_TRUNC: { + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + if (DstTy == LLT::vector(2, 16)) { + unsigned DstSize = DstTy.getSizeInBits(); + unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); + + OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize); + OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); + OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); + break; + } + + LLVM_FALLTHROUGH; + } + case AMDGPU::G_MERGE_VALUES: case AMDGPU::G_CONCAT_VECTORS: { unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; @@ -1760,6 +2402,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_CTTZ_ZERO_UNDEF: case AMDGPU::G_CTPOP: case AMDGPU::G_BSWAP: + case AMDGPU::G_BITREVERSE: case AMDGPU::G_FABS: case AMDGPU::G_FNEG: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); @@ -1848,7 +2491,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { Op3Bank == AMDGPU::SGPRRegBankID && (Size == 32 || (Size == 64 && (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && - MF.getSubtarget<GCNSubtarget>().hasScalarCompareEq64())); + Subtarget.hasScalarCompareEq64())); unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; @@ -1859,14 +2502,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_EXTRACT_VECTOR_ELT: { - unsigned OutputBankID = isSALUMapping(MI) ? - AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; + // VGPR index can be used for waterfall when indexing a SGPR vector. + unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank); - OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize); - OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize); + OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize); + OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize); // The index can be either if the source vector is VGPR. 
OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize); @@ -1879,15 +2524,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); - unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); - unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); + unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), + MRI, *TRI); + unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); - OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize); - OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize); + OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, VecSize); + OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(InsertEltBankID, + InsertSize); // The index can be either if the source vector is VGPR. - OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); + OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize); break; } case AMDGPU::G_UNMERGE_VALUES: { @@ -1903,11 +2551,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_INTRINSIC: { - switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + switch (MI.getIntrinsicID()) { default: return getInvalidInstructionMapping(); - case Intrinsic::maxnum: - case Intrinsic::minnum: case Intrinsic::amdgcn_div_fmas: case Intrinsic::amdgcn_trig_preop: case Intrinsic::amdgcn_sin: @@ -1938,6 +2584,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_mbcnt_hi: case Intrinsic::amdgcn_ubfe: case Intrinsic::amdgcn_sbfe: + case Intrinsic::amdgcn_mul_u24: + case Intrinsic::amdgcn_mul_i24: case Intrinsic::amdgcn_lerp: case Intrinsic::amdgcn_sad_u8: case Intrinsic::amdgcn_msad_u8: @@ -1956,10 +2604,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_udot4: case Intrinsic::amdgcn_sdot8: case Intrinsic::amdgcn_udot8: - case Intrinsic::amdgcn_fdiv_fast: case Intrinsic::amdgcn_wwm: case Intrinsic::amdgcn_wqm: return getDefaultMappingVOP(MI); + case Intrinsic::amdgcn_ds_swizzle: case Intrinsic::amdgcn_ds_permute: case Intrinsic::amdgcn_ds_bpermute: case Intrinsic::amdgcn_update_dpp: @@ -2040,7 +2688,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case Intrinsic::amdgcn_readlane: { // This must be an SGPR, but accept a VGPR. 
- unsigned IdxReg = MI.getOperand(3).getReg(); + Register IdxReg = MI.getOperand(3).getReg(); unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); @@ -2055,10 +2703,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case Intrinsic::amdgcn_writelane: { unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - unsigned SrcReg = MI.getOperand(2).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID); - unsigned IdxReg = MI.getOperand(3).getReg(); + Register IdxReg = MI.getOperand(3).getReg(); unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); @@ -2081,9 +2729,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { - switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { - default: - return getInvalidInstructionMapping(); + auto IntrID = MI.getIntrinsicID(); + switch (IntrID) { case Intrinsic::amdgcn_s_getreg: case Intrinsic::amdgcn_s_memtime: case Intrinsic::amdgcn_s_memrealtime: @@ -2123,18 +2770,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); break; case Intrinsic::amdgcn_exp: - OpdsMapping[0] = nullptr; // IntrinsicID - // FIXME: These are immediate values which can't be read from registers. - OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); - OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); // FIXME: Could we support packed types here? OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); - // FIXME: These are immediate values which can't be read from registers. 
- OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); - OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); break; case Intrinsic::amdgcn_buffer_load: { Register RSrc = MI.getOperand(2).getReg(); // SGPR @@ -2169,11 +2809,97 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); break; } - case Intrinsic::amdgcn_end_cf: { + case Intrinsic::amdgcn_end_cf: + case Intrinsic::amdgcn_init_exec: { + unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + break; + } + case Intrinsic::amdgcn_else: { + unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); + break; + } + case Intrinsic::amdgcn_kill: { + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); + break; + } + case Intrinsic::amdgcn_raw_buffer_load: + case Intrinsic::amdgcn_raw_tbuffer_load: { + // FIXME: Should make intrinsic ID the last operand of the instruction, + // then this would be the same as store + OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + break; + } + case Intrinsic::amdgcn_raw_buffer_store: + case Intrinsic::amdgcn_raw_buffer_store_format: + case Intrinsic::amdgcn_raw_tbuffer_store: { + OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + break; + } + case Intrinsic::amdgcn_struct_buffer_load: + case Intrinsic::amdgcn_struct_tbuffer_load: { + OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); + break; + } + case Intrinsic::amdgcn_struct_buffer_store: + case Intrinsic::amdgcn_struct_tbuffer_store: { + OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); + break; + } + case Intrinsic::amdgcn_init_exec_from_input: { unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + break; + } + case Intrinsic::amdgcn_ds_gws_init: + case Intrinsic::amdgcn_ds_gws_barrier: + case Intrinsic::amdgcn_ds_gws_sema_br: { + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + + // This must be an SGPR, but 
accept a VGPR. + unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, + AMDGPU::SGPRRegBankID); + OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); break; } + case Intrinsic::amdgcn_ds_gws_sema_v: + case Intrinsic::amdgcn_ds_gws_sema_p: + case Intrinsic::amdgcn_ds_gws_sema_release_all: { + // This must be an SGPR, but accept a VGPR. + unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, + AMDGPU::SGPRRegBankID); + OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); + break; + } + default: + if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = + AMDGPU::lookupRsrcIntrinsic(IntrID)) { + // Non-images can have complications from operands that allow both SGPR + // and VGPR. For now it's too complicated to figure out the final opcode + // to derive the register bank from the MCInstrDesc. + if (RSrcIntrin->IsImage) + return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); + } + + return getInvalidInstructionMapping(); } break; } @@ -2216,6 +2942,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case AMDGPU::G_LOAD: + case AMDGPU::G_ZEXTLOAD: + case AMDGPU::G_SEXTLOAD: return getInstrMappingForLoad(MI); case AMDGPU::G_ATOMICRMW_XCHG: @@ -2228,6 +2956,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_ATOMICRMW_MIN: case AMDGPU::G_ATOMICRMW_UMAX: case AMDGPU::G_ATOMICRMW_UMIN: + case AMDGPU::G_ATOMICRMW_FADD: case AMDGPU::G_ATOMIC_CMPXCHG: { return getDefaultMappingAllVGPR(MI); } @@ -2247,4 +2976,3 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { getOperandsMapping(OpdsMapping), MI.getNumOperands()); } - diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index f3a96e2a6128..a14b74961118 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -13,6 +13,8 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" @@ -23,7 +25,9 @@ namespace llvm { class LLT; +class GCNSubtarget; class MachineIRBuilder; +class SIInstrInfo; class SIRegisterInfo; class TargetRegisterInfo; @@ -36,9 +40,27 @@ protected: #include "AMDGPUGenRegisterBank.inc" }; class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { + const GCNSubtarget &Subtarget; const SIRegisterInfo *TRI; + const SIInstrInfo *TII; - void executeInWaterfallLoop(MachineInstr &MI, + bool collectWaterfallOperands( + SmallSet<Register, 4> &SGPROperandRegs, + MachineInstr &MI, + MachineRegisterInfo &MRI, + ArrayRef<unsigned> OpIndices) const; + + bool executeInWaterfallLoop( + MachineIRBuilder &B, + iterator_range<MachineBasicBlock::iterator> Range, + SmallSet<Register, 4> &SGPROperandRegs, + MachineRegisterInfo &MRI) const; + + bool executeInWaterfallLoop(MachineIRBuilder &B, + MachineInstr &MI, + MachineRegisterInfo &MRI, + ArrayRef<unsigned> OpIndices) const; + bool executeInWaterfallLoop(MachineInstr &MI, MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const; @@ -47,6 +69,19 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { bool applyMappingWideLoad(MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, MachineRegisterInfo &MRI) const; + bool + applyMappingImage(MachineInstr &MI, + const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, + MachineRegisterInfo &MRI, 
int RSrcIdx) const; + + Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, + Register Reg) const; + + std::pair<Register, unsigned> + splitBufferOffsets(MachineIRBuilder &B, Register Offset) const; + + MachineInstr *selectStoreIntrinsic(MachineIRBuilder &B, + MachineInstr &MI) const; /// See RegisterBankInfo::applyMapping. void applyMappingImpl(const OperandsMapper &OpdMapper) const override; @@ -58,6 +93,16 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { const TargetRegisterInfo &TRI, unsigned Default = AMDGPU::VGPRRegBankID) const; + // Return a value mapping for an operand that is required to be an SGPR. + const ValueMapping *getSGPROpMapping(Register Reg, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const; + + // Return a value mapping for an operand that is required to be a VGPR. + const ValueMapping *getVGPROpMapping(Register Reg, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const; + /// Split 64-bit value \p Reg into two 32-bit halves and populate them into \p /// Regs. This appropriately sets the regbank of the new registers. void split64BitValueForMapping(MachineIRBuilder &B, @@ -90,8 +135,13 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const; const InstructionMapping &getDefaultMappingAllVGPR( const MachineInstr &MI) const; + + const InstructionMapping &getImageMapping(const MachineRegisterInfo &MRI, + const MachineInstr &MI, + int RsrcIdx) const; + public: - AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI); + AMDGPURegisterBankInfo(const GCNSubtarget &STI); unsigned copyCost(const RegisterBank &A, const RegisterBank &B, unsigned Size) const override; diff --git a/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/lib/Target/AMDGPU/AMDGPURegisterBanks.td index 9555694fb106..00f53b157577 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ b/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -7,14 +7,14 @@ //===----------------------------------------------------------------------===// def SGPRRegBank : RegisterBank<"SGPR", - [SReg_32, SReg_64, SReg_128, SReg_256, SReg_512] + [SReg_32, SReg_64, SReg_128, SReg_256, SReg_512, SReg_1024] >; def VGPRRegBank : RegisterBank<"VGPR", - [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512] + [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512, VReg_1024] >; def SCCRegBank : RegisterBank <"SCC", [SReg_32, SCC_CLASS]>; // It is helpful to distinguish conditions from ordinary SGPRs. -def VCCRegBank : RegisterBank <"VCC", [SReg_64]>; +def VCCRegBank : RegisterBank <"VCC", [SReg_1]>; diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp index 7cffdf1a4dcf..9806e6b0714f 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -26,19 +26,59 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {} // they are not supported at this time. 
//===----------------------------------------------------------------------===// -unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) { - static const unsigned SubRegs[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, - AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, - AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, - AMDGPU::sub15, AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, - AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, AMDGPU::sub24, - AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, AMDGPU::sub28, AMDGPU::sub29, - AMDGPU::sub30, AMDGPU::sub31 - }; +// Table of NumRegs sized pieces at every 32-bit offset. +static const uint16_t SubRegFromChannelTable[][32] = { + { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, + AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, + AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, + AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, + AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31 + }, + { + AMDGPU::sub0_sub1, AMDGPU::sub1_sub2, AMDGPU::sub2_sub3, AMDGPU::sub3_sub4, + AMDGPU::sub4_sub5, AMDGPU::sub5_sub6, AMDGPU::sub6_sub7, AMDGPU::sub7_sub8, + AMDGPU::sub8_sub9, AMDGPU::sub9_sub10, AMDGPU::sub10_sub11, AMDGPU::sub11_sub12, + AMDGPU::sub12_sub13, AMDGPU::sub13_sub14, AMDGPU::sub14_sub15, AMDGPU::sub15_sub16, + AMDGPU::sub16_sub17, AMDGPU::sub17_sub18, AMDGPU::sub18_sub19, AMDGPU::sub19_sub20, + AMDGPU::sub20_sub21, AMDGPU::sub21_sub22, AMDGPU::sub22_sub23, AMDGPU::sub23_sub24, + AMDGPU::sub24_sub25, AMDGPU::sub25_sub26, AMDGPU::sub26_sub27, AMDGPU::sub27_sub28, + AMDGPU::sub28_sub29, AMDGPU::sub29_sub30, AMDGPU::sub30_sub31, AMDGPU::NoSubRegister + }, + { + AMDGPU::sub0_sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub3_sub4_sub5, + AMDGPU::sub4_sub5_sub6, AMDGPU::sub5_sub6_sub7, AMDGPU::sub6_sub7_sub8, AMDGPU::sub7_sub8_sub9, + AMDGPU::sub8_sub9_sub10, AMDGPU::sub9_sub10_sub11, AMDGPU::sub10_sub11_sub12, AMDGPU::sub11_sub12_sub13, + AMDGPU::sub12_sub13_sub14, AMDGPU::sub13_sub14_sub15, AMDGPU::sub14_sub15_sub16, AMDGPU::sub15_sub16_sub17, + AMDGPU::sub16_sub17_sub18, AMDGPU::sub17_sub18_sub19, AMDGPU::sub18_sub19_sub20, AMDGPU::sub19_sub20_sub21, + AMDGPU::sub20_sub21_sub22, AMDGPU::sub21_sub22_sub23, AMDGPU::sub22_sub23_sub24, AMDGPU::sub23_sub24_sub25, + AMDGPU::sub24_sub25_sub26, AMDGPU::sub25_sub26_sub27, AMDGPU::sub26_sub27_sub28, AMDGPU::sub27_sub28_sub29, + AMDGPU::sub28_sub29_sub30, AMDGPU::sub29_sub30_sub31, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister + }, + { + AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6, + AMDGPU::sub4_sub5_sub6_sub7, AMDGPU::sub5_sub6_sub7_sub8, AMDGPU::sub6_sub7_sub8_sub9, AMDGPU::sub7_sub8_sub9_sub10, + AMDGPU::sub8_sub9_sub10_sub11, AMDGPU::sub9_sub10_sub11_sub12, AMDGPU::sub10_sub11_sub12_sub13, AMDGPU::sub11_sub12_sub13_sub14, + AMDGPU::sub12_sub13_sub14_sub15, AMDGPU::sub13_sub14_sub15_sub16, AMDGPU::sub14_sub15_sub16_sub17, AMDGPU::sub15_sub16_sub17_sub18, + AMDGPU::sub16_sub17_sub18_sub19, AMDGPU::sub17_sub18_sub19_sub20, AMDGPU::sub18_sub19_sub20_sub21, AMDGPU::sub19_sub20_sub21_sub22, + AMDGPU::sub20_sub21_sub22_sub23, AMDGPU::sub21_sub22_sub23_sub24, 
AMDGPU::sub22_sub23_sub24_sub25, AMDGPU::sub23_sub24_sub25_sub26, + AMDGPU::sub24_sub25_sub26_sub27, AMDGPU::sub25_sub26_sub27_sub28, AMDGPU::sub26_sub27_sub28_sub29, AMDGPU::sub27_sub28_sub29_sub30, + AMDGPU::sub28_sub29_sub30_sub31, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister + } +}; + +// FIXME: TableGen should generate something to make this manageable for all +// register classes. At a minimum we could use the opposite of +// composeSubRegIndices and go up from the base 32-bit subreg. +unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel, unsigned NumRegs) { + const unsigned NumRegIndex = NumRegs - 1; - assert(Channel < array_lengthof(SubRegs)); - return SubRegs[Channel]; + assert(NumRegIndex < array_lengthof(SubRegFromChannelTable) && + "Not implemented"); + assert(Channel < array_lengthof(SubRegFromChannelTable[0])); + return SubRegFromChannelTable[NumRegIndex][Channel]; } void AMDGPURegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h index 3453a8c1b0b3..9e713ca804a1 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -28,7 +28,7 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { /// \returns the sub reg enum value for the given \p Channel /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0) - static unsigned getSubRegFromChannel(unsigned Channel); + static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs = 1); void reserveRegisterTuples(BitVector &, unsigned Reg) const; }; diff --git a/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/lib/Target/AMDGPU/AMDGPUSearchableTables.td index f8703c36127a..26b8b7840270 100644 --- a/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -81,6 +81,8 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_umax>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_and>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_or>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>; @@ -92,6 +94,8 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_umax>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_and>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_or>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>; def : SourceOfDivergence<int_amdgcn_ps_live>; def : SourceOfDivergence<int_amdgcn_ds_swizzle>; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 1eb9b83456c5..3bb6dd4571c0 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -175,6 +175,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : HasFminFmaxLegacy(true), EnablePromoteAlloca(false), HasTrigReducedRange(false), + MaxWavesPerEU(10), LocalMemorySize(0), WavefrontSize(0) { } @@ -261,6 +262,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 
AddNoCarryInsts(false), HasUnpackedD16VMem(false), LDSMisalignedBug(false), + HasMFMAInlineLiteralBug(false), ScalarizeGlobal(false), @@ -278,9 +280,10 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), TLInfo(TM, *this), FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { + MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this); CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); Legalizer.reset(new AMDGPULegalizerInfo(*this, TM)); - RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo())); + RegBankInfo.reset(new AMDGPURegisterBankInfo(*this)); InstSelector.reset(new AMDGPUInstructionSelector( *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM)); } @@ -489,28 +492,28 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { } uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, - unsigned &MaxAlign) const { + Align &MaxAlign) const { assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || F.getCallingConv() == CallingConv::SPIR_KERNEL); const DataLayout &DL = F.getParent()->getDataLayout(); uint64_t ExplicitArgBytes = 0; - MaxAlign = 1; + MaxAlign = Align::None(); for (const Argument &Arg : F.args()) { Type *ArgTy = Arg.getType(); - unsigned Align = DL.getABITypeAlignment(ArgTy); + const Align Alignment(DL.getABITypeAlignment(ArgTy)); uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); - ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize; - MaxAlign = std::max(MaxAlign, Align); + ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize; + MaxAlign = std::max(MaxAlign, Alignment); } return ExplicitArgBytes; } unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F, - unsigned &MaxAlign) const { + Align &MaxAlign) const { uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign); unsigned ExplicitOffset = getExplicitKernelArgOffset(F); @@ -518,7 +521,7 @@ unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F, uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes; unsigned ImplicitBytes = getImplicitArgNumBytes(F); if (ImplicitBytes != 0) { - unsigned Alignment = getAlignmentForImplicitArgPtr(); + const Align Alignment = getAlignmentForImplicitArgPtr(); TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; } @@ -566,7 +569,7 @@ bool GCNSubtarget::hasMadF16() const { unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { if (getGeneration() >= AMDGPUSubtarget::GFX10) - return 10; + return getMaxWavesPerEU(); if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { if (SGPRs <= 80) @@ -591,25 +594,12 @@ unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { } unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { - if (VGPRs <= 24) - return 10; - if (VGPRs <= 28) - return 9; - if (VGPRs <= 32) - return 8; - if (VGPRs <= 36) - return 7; - if (VGPRs <= 40) - return 6; - if (VGPRs <= 48) - return 5; - if (VGPRs <= 64) - return 4; - if (VGPRs <= 84) - return 3; - if (VGPRs <= 128) - return 2; - return 1; + unsigned MaxWaves = getMaxWavesPerEU(); + unsigned Granule = getVGPRAllocGranule(); + if (VGPRs < Granule) + return MaxWaves; + unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule; + return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves); } unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { @@ -629,6 +619,20 @@ unsigned 
GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { return 2; // VCC. } +unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF, + unsigned LDSSize, + unsigned NumSGPRs, + unsigned NumVGPRs) const { + unsigned Occupancy = + std::min(getMaxWavesPerEU(), + getOccupancyWithLocalMemSize(LDSSize, MF.getFunction())); + if (NumSGPRs) + Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs)); + if (NumVGPRs) + Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs)); + return Occupancy; +} + unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { const Function &F = MF.getFunction(); const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); @@ -878,8 +882,8 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation { void GCNSubtarget::getPostRAMutations( std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { - Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo)); - Mutations.push_back(llvm::make_unique<FillMFMAShadowMutation>(&InstrInfo)); + Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo)); + Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo)); } const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) { diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 78c3b823946d..936feb00c62b 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -75,6 +75,7 @@ protected: bool HasFminFmaxLegacy; bool EnablePromoteAlloca; bool HasTrigReducedRange; + unsigned MaxWavesPerEU; int LocalMemorySize; unsigned WavefrontSize; @@ -195,8 +196,8 @@ public: return LocalMemorySize; } - unsigned getAlignmentForImplicitArgPtr() const { - return isAmdHsaOS() ? 8 : 4; + Align getAlignmentForImplicitArgPtr() const { + return isAmdHsaOS() ? Align(8) : Align(4); } /// Returns the offset in bytes from the start of the input buffer @@ -223,7 +224,9 @@ public: /// subtarget. virtual unsigned getMinWavesPerEU() const = 0; - unsigned getMaxWavesPerEU() const { return 10; } + /// \returns Maximum number of waves per execution unit supported by the + /// subtarget without any kind of limitation. + unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; } /// Creates value range metadata on a workitemid.* intrinsic call or load.
bool makeLIDRangeMetadata(Instruction *I) const; @@ -235,16 +238,17 @@ public: return 16; return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); } - uint64_t getExplicitKernArgSize(const Function &F, - unsigned &MaxAlign) const; - unsigned getKernArgSegmentSize(const Function &F, - unsigned &MaxAlign) const; + uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const; + unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const; virtual ~AMDGPUSubtarget() {} }; class GCNSubtarget : public AMDGPUGenSubtargetInfo, public AMDGPUSubtarget { + + using AMDGPUSubtarget::getMaxWavesPerEU; + public: enum TrapHandlerAbi { TrapHandlerAbiNone = 0, @@ -362,6 +366,7 @@ protected: bool CaymanISA; bool CFALUBug; bool LDSMisalignedBug; + bool HasMFMAInlineLiteralBug; bool HasVertexCache; short TexVTXClauseSize; bool ScalarizeGlobal; @@ -416,7 +421,7 @@ public: return CallLoweringInfo.get(); } - const InstructionSelector *getInstructionSelector() const override { + InstructionSelector *getInstructionSelector() const override { return InstSelector.get(); } @@ -544,6 +549,14 @@ public: return GFX9Insts; } + bool hasScalarPackInsts() const { + return GFX9Insts; + } + + bool hasScalarMulHiInsts() const { + return GFX9Insts; + } + TrapHandlerAbi getTrapHandlerAbi() const { return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone; } @@ -611,6 +624,11 @@ public: return getGeneration() >= AMDGPUSubtarget::GFX9; } + /// \returns true if the target supports S_DENORM_MODE. + bool hasDenormModeInst() const { + return getGeneration() >= AMDGPUSubtarget::GFX10; + } + bool useFlatForGlobal() const { return FlatForGlobal; } @@ -848,9 +866,7 @@ public: // on the pointer value itself may rely on the alignment / known low bits of // the pointer. Set this to something above the minimum to avoid needing // dynamic realignment in common cases. - unsigned getStackAlignment() const { - return 16; - } + Align getStackAlignment() const { return Align(16); } bool enableMachineScheduler() const override { return true; @@ -881,12 +897,6 @@ public: return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize); } - /// \returns Maximum number of waves per execution unit supported by the - /// subtarget without any kind of limitation. - unsigned getMaxWavesPerEU() const { - return AMDGPU::IsaInfo::getMaxWavesPerEU(); - } - /// \returns Number of waves per work group supported by the subtarget and /// limited by given \p FlatWorkGroupSize. unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { @@ -944,6 +954,14 @@ public: return HasDPP; } + bool hasDPPBroadcasts() const { + return HasDPP && getGeneration() < GFX10; + } + + bool hasDPPWavefrontShifts() const { + return HasDPP && getGeneration() < GFX10; + } + bool hasDPP8() const { return HasDPP8; } @@ -974,6 +992,10 @@ public: return SGPRInitBug; } + bool hasMFMAInlineLiteralBug() const { + return HasMFMAInlineLiteralBug; + } + bool has12DWordStoreHazard() const { return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; } @@ -1036,6 +1058,13 @@ public: /// VGPRs unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; + /// Return occupancy for the given function. Uses the LDS size and, if + /// provided, the numbers of registers. + /// Note that occupancy can be affected by the scratch allocation as well, but + /// we do not have enough information to compute it.
+ unsigned computeOccupancy(const MachineFunction &MF, unsigned LDSSize = 0, + unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; + /// \returns true if the flat_scratch register should be initialized with the /// pointer to the wave's scratch memory rather than a size and offset. bool flatScratchIsPointer() const { @@ -1226,9 +1255,7 @@ public: return Gen; } - unsigned getStackAlignment() const { - return 4; - } + Align getStackAlignment() const { return Align(4); } R600Subtarget &initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS); diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 0ea8db04c298..e8cf77161a14 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -238,16 +238,17 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPUUseNativeCallsPass(*PR); initializeAMDGPUSimplifyLibCallsPass(*PR); initializeAMDGPUInlinerPass(*PR); + initializeAMDGPUPrintfRuntimeBindingPass(*PR); initializeGCNRegBankReassignPass(*PR); initializeGCNNSAReassignPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { - return llvm::make_unique<AMDGPUTargetObjectFile>(); + return std::make_unique<AMDGPUTargetObjectFile>(); } static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { - return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>()); + return new ScheduleDAGMILive(C, std::make_unique<R600SchedStrategy>()); } static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { @@ -257,7 +258,7 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { static ScheduleDAGInstrs * createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { ScheduleDAGMILive *DAG = - new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C)); + new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C)); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); @@ -412,6 +413,7 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { PM.add(createAMDGPUExternalAAWrapperPass()); } PM.add(createAMDGPUUnifyMetadataPass()); + PM.add(createAMDGPUPrintfRuntimeBinding()); PM.add(createAMDGPUPropagateAttributesLatePass(this)); if (Internalize) { PM.add(createInternalizePass(mustPreserveGV)); @@ -482,7 +484,7 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl( // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. resetTargetOptions(F); - I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this); + I = std::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this); } return I.get(); @@ -518,7 +520,7 @@ const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
resetTargetOptions(F); - I = llvm::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this); + I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this); } I->setScalarizeGlobalBehavior(ScalarizeGlobal); @@ -659,6 +661,8 @@ void AMDGPUPassConfig::addIRPasses() { disablePass(&FuncletLayoutID); disablePass(&PatchableFunctionID); + addPass(createAMDGPUPrintfRuntimeBinding()); + // This must occur before inlining, as the inliner will not look through // bitcast calls. addPass(createAMDGPUFixFunctionBitcastsPass()); @@ -681,12 +685,6 @@ void AMDGPUPassConfig::addIRPasses() { // without ever running any passes on the second. addPass(createBarrierNoopPass()); - if (TM.getTargetTriple().getArch() == Triple::amdgcn) { - // TODO: May want to move later or split into an early and late one. - - addPass(createAMDGPUCodeGenPreparePass()); - } - // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. if (TM.getTargetTriple().getArch() == Triple::r600) addPass(createR600OpenCLImageTypeLoweringPass()); @@ -714,6 +712,11 @@ void AMDGPUPassConfig::addIRPasses() { } } + if (TM.getTargetTriple().getArch() == Triple::amdgcn) { + // TODO: May want to move later or split into an early and late one. + addPass(createAMDGPUCodeGenPreparePass()); + } + TargetPassConfig::addIRPasses(); // EarlyCSE is not always strong enough to clean up what LSR produces. For @@ -1046,7 +1049,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo( return true; if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG && - !AMDGPU::SReg_128RegClass.contains(MFI->ScratchRSrcReg)) { + !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) { return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg); } @@ -1095,7 +1098,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo( if (YamlMFI.ArgInfo && (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer, - AMDGPU::SReg_128RegClass, + AMDGPU::SGPR_128RegClass, MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) || parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr, AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr, diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index aaed280a1270..616196ad5ba3 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -57,7 +57,7 @@ using namespace llvm; static cl::opt<unsigned> UnrollThresholdPrivate( "amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), - cl::init(2500), cl::Hidden); + cl::init(2000), cl::Hidden); static cl::opt<unsigned> UnrollThresholdLocal( "amdgpu-unroll-threshold-local", @@ -590,6 +590,61 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const { return false; } +bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, + Intrinsic::ID IID) const { + switch (IID) { + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: + case Intrinsic::amdgcn_is_shared: + case Intrinsic::amdgcn_is_private: + OpIndexes.push_back(0); + return true; + default: + return false; + } +} + +bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace( + IntrinsicInst *II, Value *OldV, Value *NewV) const { + auto IntrID = II->getIntrinsicID(); + switch (IntrID) { + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: { + const 
ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4)); + if (!IsVolatile->isZero()) + return false; + Module *M = II->getParent()->getParent()->getParent(); + Type *DestTy = II->getType(); + Type *SrcTy = NewV->getType(); + Function *NewDecl = + Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy}); + II->setArgOperand(0, NewV); + II->setCalledFunction(NewDecl); + return true; + } + case Intrinsic::amdgcn_is_shared: + case Intrinsic::amdgcn_is_private: { + unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ? + AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS; + unsigned NewAS = NewV->getType()->getPointerAddressSpace(); + LLVMContext &Ctx = NewV->getType()->getContext(); + ConstantInt *NewVal = (TrueAS == NewAS) ? + ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx); + II->replaceAllUsesWith(NewVal); + II->eraseFromParent(); + return true; + } + default: + return false; + } +} + unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { if (ST->hasVOP3PInsts()) { @@ -638,6 +693,39 @@ void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, CommonTTI.getUnrollingPreferences(L, SE, UP); } +unsigned GCNTTIImpl::getUserCost(const User *U, + ArrayRef<const Value *> Operands) { + // Estimate extractelement elimination + if (const ExtractElementInst *EE = dyn_cast<ExtractElementInst>(U)) { + ConstantInt *CI = dyn_cast<ConstantInt>(EE->getOperand(1)); + unsigned Idx = -1; + if (CI) + Idx = CI->getZExtValue(); + return getVectorInstrCost(EE->getOpcode(), EE->getOperand(0)->getType(), + Idx); + } + + // Estimate insertelement elimination + if (const InsertElementInst *IE = dyn_cast<InsertElementInst>(U)) { + ConstantInt *CI = dyn_cast<ConstantInt>(IE->getOperand(2)); + unsigned Idx = -1; + if (CI) + Idx = CI->getZExtValue(); + return getVectorInstrCost(IE->getOpcode(), IE->getType(), Idx); + } + + // Estimate different intrinsics, e.g. llvm.fabs + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) { + SmallVector<Value *, 4> Args(II->arg_operands()); + FastMathFlags FMF; + if (auto *FPMO = dyn_cast<FPMathOperator>(II)) + FMF = FPMO->getFastMathFlags(); + return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args, + FMF); + } + return BaseT::getUserCost(U, Operands); +} + unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const { return 4 * 128; // XXX - 4 channels. Should these count as vector instead? 
} diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 6f1bf5a26f0d..67f7f9074f10 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -46,10 +46,18 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> { Triple TargetTriple; + const TargetSubtargetInfo *ST; + const TargetLoweringBase *TLI; + + const TargetSubtargetInfo *getST() const { return ST; } + const TargetLoweringBase *getTLI() const { return TLI; } + public: explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) - : BaseT(TM, F.getParent()->getDataLayout()), - TargetTriple(TM->getTargetTriple()) {} + : BaseT(TM, F.getParent()->getDataLayout()), + TargetTriple(TM->getTargetTriple()), + ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))), + TLI(ST->getTargetLowering()) {} void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); @@ -183,6 +191,11 @@ public: return AMDGPUAS::FLAT_ADDRESS; } + bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, + Intrinsic::ID IID) const; + bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, + Value *OldV, Value *NewV) const; + unsigned getVectorSplitCost() { return 0; } unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, @@ -191,7 +204,7 @@ public: bool areInlineCompatible(const Function *Caller, const Function *Callee) const; - unsigned getInliningThresholdMultiplier() { return 7; } + unsigned getInliningThresholdMultiplier() { return 9; } int getInlinerVectorBonusPercent() { return 0; } @@ -201,6 +214,7 @@ public: int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm, bool IsUnsigned); + unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands); }; class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> { diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 12f2e9519c9e..101ecfc0c87c 100644 --- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -1307,8 +1307,8 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, if (LandBlkHasOtherPred) { report_fatal_error("Extra register needed to handle CFG"); - unsigned CmpResReg = - HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); + Register CmpResReg = + HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); report_fatal_error("Extra compare instruction needed to handle CFG"); insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, CmpResReg, DebugLoc()); @@ -1316,8 +1316,8 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // XXX: We are running this after RA, so creating virtual registers will // cause an assertion failure in the PostRA scheduling pass. 
- unsigned InitReg = - HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); + Register InitReg = + HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, InitReg, DebugLoc()); diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 6d678966c98e..9dd511fab57c 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -143,6 +143,7 @@ public: ImmTyDLC, ImmTyGLC, ImmTySLC, + ImmTySWZ, ImmTyTFE, ImmTyD16, ImmTyClampSI, @@ -216,14 +217,15 @@ public: if (Kind == Token) return true; - if (Kind != Expression || !Expr) - return false; - // When parsing operands, we can't always tell if something was meant to be // a token, like 'gds', or an expression that references a global variable. // In this case, we assume the string is an expression, and if we need to // interpret it as a token, then we treat the symbol name as the token. - return isa<MCSymbolRefExpr>(Expr); + return isSymbolRefExpr(); + } + + bool isSymbolRefExpr() const { + return isExpr() && Expr && isa<MCSymbolRefExpr>(Expr); } bool isImm() const override { @@ -274,8 +276,10 @@ public: isRegClass(AMDGPU::VReg_64RegClassID) || isRegClass(AMDGPU::VReg_96RegClassID) || isRegClass(AMDGPU::VReg_128RegClassID) || + isRegClass(AMDGPU::VReg_160RegClassID) || isRegClass(AMDGPU::VReg_256RegClassID) || - isRegClass(AMDGPU::VReg_512RegClassID); + isRegClass(AMDGPU::VReg_512RegClassID) || + isRegClass(AMDGPU::VReg_1024RegClassID); } bool isVReg32() const { @@ -286,6 +290,10 @@ public: return isOff() || isVReg32(); } + bool isNull() const { + return isRegKind() && getReg() == AMDGPU::SGPR_NULL; + } + bool isSDWAOperand(MVT type) const; bool isSDWAFP16Operand() const; bool isSDWAFP32Operand() const; @@ -325,6 +333,7 @@ public: bool isDLC() const { return isImmTy(ImmTyDLC); } bool isGLC() const { return isImmTy(ImmTyGLC); } bool isSLC() const { return isImmTy(ImmTySLC); } + bool isSWZ() const { return isImmTy(ImmTySWZ); } bool isTFE() const { return isImmTy(ImmTyTFE); } bool isD16() const { return isImmTy(ImmTyD16); } bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<8>(getImm()); } @@ -817,6 +826,7 @@ public: case ImmTyDLC: OS << "DLC"; break; case ImmTyGLC: OS << "GLC"; break; case ImmTySLC: OS << "SLC"; break; + case ImmTySWZ: OS << "SWZ"; break; case ImmTyTFE: OS << "TFE"; break; case ImmTyD16: OS << "D16"; break; case ImmTyFORMAT: OS << "FORMAT"; break; @@ -886,7 +896,7 @@ public: int64_t Val, SMLoc Loc, ImmTy Type = ImmTyNone, bool IsFPImm = false) { - auto Op = llvm::make_unique<AMDGPUOperand>(Immediate, AsmParser); + auto Op = std::make_unique<AMDGPUOperand>(Immediate, AsmParser); Op->Imm.Val = Val; Op->Imm.IsFPImm = IsFPImm; Op->Imm.Type = Type; @@ -899,7 +909,7 @@ public: static AMDGPUOperand::Ptr CreateToken(const AMDGPUAsmParser *AsmParser, StringRef Str, SMLoc Loc, bool HasExplicitEncodingSize = true) { - auto Res = llvm::make_unique<AMDGPUOperand>(Token, AsmParser); + auto Res = std::make_unique<AMDGPUOperand>(Token, AsmParser); Res->Tok.Data = Str.data(); Res->Tok.Length = Str.size(); Res->StartLoc = Loc; @@ -910,7 +920,7 @@ public: static AMDGPUOperand::Ptr CreateReg(const AMDGPUAsmParser *AsmParser, unsigned RegNo, SMLoc S, SMLoc E) { - auto Op = llvm::make_unique<AMDGPUOperand>(Register, AsmParser); + auto Op = std::make_unique<AMDGPUOperand>(Register, AsmParser); Op->Reg.RegNo = RegNo; Op->Reg.Mods = Modifiers(); Op->StartLoc = 
S; @@ -920,7 +930,7 @@ public: static AMDGPUOperand::Ptr CreateExpr(const AMDGPUAsmParser *AsmParser, const class MCExpr *Expr, SMLoc S) { - auto Op = llvm::make_unique<AMDGPUOperand>(Expression, AsmParser); + auto Op = std::make_unique<AMDGPUOperand>(Expression, AsmParser); Op->Expr = Expr; Op->StartLoc = S; Op->EndLoc = S; @@ -1051,11 +1061,23 @@ private: std::string &CollectString); bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, - RegisterKind RegKind, unsigned Reg1, - unsigned RegNum); + RegisterKind RegKind, unsigned Reg1); bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, - unsigned& RegNum, unsigned& RegWidth, - unsigned *DwordRegIndex); + unsigned& RegNum, unsigned& RegWidth); + unsigned ParseRegularReg(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth); + unsigned ParseSpecialReg(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth); + unsigned ParseRegList(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth); + bool ParseRegRange(unsigned& Num, unsigned& Width); + unsigned getRegularReg(RegisterKind RegKind, + unsigned RegNum, + unsigned RegWidth); + bool isRegister(); bool isRegister(const AsmToken &Token, const AsmToken &NextToken) const; Optional<StringRef> getGprCountSymbolName(RegisterKind RegKind); @@ -1306,6 +1328,7 @@ private: bool validateOpSel(const MCInst &Inst); bool validateVccOperand(unsigned Reg) const; bool validateVOP3Literal(const MCInst &Inst) const; + unsigned getConstantBusLimit(unsigned Opcode) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; @@ -1321,6 +1344,7 @@ private: void peekTokens(MutableArrayRef<AsmToken> Tokens); AsmToken::TokenKind getTokenKind() const; bool parseExpr(int64_t &Imm); + bool parseExpr(OperandVector &Operands); StringRef getTokenStr() const; AsmToken peekToken(); AsmToken getToken() const; @@ -1399,9 +1423,12 @@ public: void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands); + void cvtSdwaVOP2e(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands); void cvtSDWA(MCInst &Inst, const OperandVector &Operands, - uint64_t BasicInstType, bool skipVcc = false); + uint64_t BasicInstType, + bool SkipDstVcc = false, + bool SkipSrcVcc = false); AMDGPUOperand::Ptr defaultBLGP() const; AMDGPUOperand::Ptr defaultCBSZ() const; @@ -1636,8 +1663,8 @@ bool AMDGPUOperand::isSDWAInt32Operand() const { } bool AMDGPUOperand::isBoolReg() const { - return AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ? 
- isSCSrcB64() : isSCSrcB32(); + return (AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize64] && isSCSrcB64()) || + (AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize32] && isSCSrcB32()); } uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const @@ -1849,6 +1876,8 @@ static bool isInlineValue(unsigned Reg) { case AMDGPU::SRC_EXECZ: case AMDGPU::SRC_SCC: return true; + case AMDGPU::SGPR_NULL: + return true; default: return false; } @@ -1870,8 +1899,10 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) { case 2: return AMDGPU::VReg_64RegClassID; case 3: return AMDGPU::VReg_96RegClassID; case 4: return AMDGPU::VReg_128RegClassID; + case 5: return AMDGPU::VReg_160RegClassID; case 8: return AMDGPU::VReg_256RegClassID; case 16: return AMDGPU::VReg_512RegClassID; + case 32: return AMDGPU::VReg_1024RegClassID; } } else if (Is == IS_TTMP) { switch (RegWidth) { @@ -1944,7 +1975,7 @@ static unsigned getSpecialRegForName(StringRef RegName) { .Case("tba_lo", AMDGPU::TBA_LO) .Case("tba_hi", AMDGPU::TBA_HI) .Case("null", AMDGPU::SGPR_NULL) - .Default(0); + .Default(AMDGPU::NoRegister); } bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, @@ -1959,8 +1990,7 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, } bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, - RegisterKind RegKind, unsigned Reg1, - unsigned RegNum) { + RegisterKind RegKind, unsigned Reg1) { switch (RegKind) { case IS_SPECIAL: if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) { @@ -2008,14 +2038,37 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, } } -static const StringRef Registers[] = { - { "v" }, - { "s" }, - { "ttmp" }, - { "acc" }, - { "a" }, +struct RegInfo { + StringLiteral Name; + RegisterKind Kind; +}; + +static constexpr RegInfo RegularRegisters[] = { + {{"v"}, IS_VGPR}, + {{"s"}, IS_SGPR}, + {{"ttmp"}, IS_TTMP}, + {{"acc"}, IS_AGPR}, + {{"a"}, IS_AGPR}, }; +static bool isRegularReg(RegisterKind Kind) { + return Kind == IS_VGPR || + Kind == IS_SGPR || + Kind == IS_TTMP || + Kind == IS_AGPR; +} + +static const RegInfo* getRegularRegInfo(StringRef Str) { + for (const RegInfo &Reg : RegularRegisters) + if (Str.startswith(Reg.Name)) + return &Reg; + return nullptr; +} + +static bool getRegNum(StringRef Str, unsigned& Num) { + return !Str.getAsInteger(10, Num); +} + bool AMDGPUAsmParser::isRegister(const AsmToken &Token, const AsmToken &NextToken) const { @@ -2029,24 +2082,24 @@ AMDGPUAsmParser::isRegister(const AsmToken &Token, // A single register like s0 or a range of registers like s[0:1] - StringRef RegName = Token.getString(); - - for (StringRef Reg : Registers) { - if (RegName.startswith(Reg)) { - if (Reg.size() < RegName.size()) { - unsigned RegNum; - // A single register with an index: rXX - if (!RegName.substr(Reg.size()).getAsInteger(10, RegNum)) - return true; - } else { - // A range of registers: r[XX:YY]. - if (NextToken.is(AsmToken::LBrac)) - return true; - } + StringRef Str = Token.getString(); + const RegInfo *Reg = getRegularRegInfo(Str); + if (Reg) { + StringRef RegName = Reg->Name; + StringRef RegSuffix = Str.substr(RegName.size()); + if (!RegSuffix.empty()) { + unsigned Num; + // A single register with an index: rXX + if (getRegNum(RegSuffix, Num)) + return true; + } else { + // A range of registers: r[XX:YY]. 
+ if (NextToken.is(AsmToken::LBrac)) + return true; } } - return getSpecialRegForName(RegName); + return getSpecialRegForName(Str) != AMDGPU::NoRegister; } bool @@ -2055,137 +2108,161 @@ AMDGPUAsmParser::isRegister() return isRegister(getToken(), peekToken()); } -bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, - unsigned &RegNum, unsigned &RegWidth, - unsigned *DwordRegIndex) { - if (DwordRegIndex) { *DwordRegIndex = 0; } +unsigned +AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, + unsigned RegNum, + unsigned RegWidth) { + + assert(isRegularReg(RegKind)); + + unsigned AlignSize = 1; + if (RegKind == IS_SGPR || RegKind == IS_TTMP) { + // SGPR and TTMP registers must be aligned. + // Max required alignment is 4 dwords. + AlignSize = std::min(RegWidth, 4u); + } + + if (RegNum % AlignSize != 0) + return AMDGPU::NoRegister; + + unsigned RegIdx = RegNum / AlignSize; + int RCID = getRegClass(RegKind, RegWidth); + if (RCID == -1) + return AMDGPU::NoRegister; + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); - if (getLexer().is(AsmToken::Identifier)) { - StringRef RegName = Parser.getTok().getString(); - if ((Reg = getSpecialRegForName(RegName))) { - Parser.Lex(); - RegKind = IS_SPECIAL; - } else { - unsigned RegNumIndex = 0; - if (RegName[0] == 'v') { - RegNumIndex = 1; - RegKind = IS_VGPR; - } else if (RegName[0] == 's') { - RegNumIndex = 1; - RegKind = IS_SGPR; - } else if (RegName[0] == 'a') { - RegNumIndex = RegName.startswith("acc") ? 3 : 1; - RegKind = IS_AGPR; - } else if (RegName.startswith("ttmp")) { - RegNumIndex = strlen("ttmp"); - RegKind = IS_TTMP; - } else { - return false; - } - if (RegName.size() > RegNumIndex) { - // Single 32-bit register: vXX. - if (RegName.substr(RegNumIndex).getAsInteger(10, RegNum)) - return false; - Parser.Lex(); - RegWidth = 1; - } else { - // Range of registers: v[XX:YY]. ":YY" is optional. 
- Parser.Lex(); - int64_t RegLo, RegHi; - if (getLexer().isNot(AsmToken::LBrac)) - return false; - Parser.Lex(); + const MCRegisterClass RC = TRI->getRegClass(RCID); + if (RegIdx >= RC.getNumRegs()) + return AMDGPU::NoRegister; - if (getParser().parseAbsoluteExpression(RegLo)) - return false; + return RC.getRegister(RegIdx); +} - const bool isRBrace = getLexer().is(AsmToken::RBrac); - if (!isRBrace && getLexer().isNot(AsmToken::Colon)) - return false; - Parser.Lex(); +bool +AMDGPUAsmParser::ParseRegRange(unsigned& Num, unsigned& Width) { + int64_t RegLo, RegHi; + if (!trySkipToken(AsmToken::LBrac)) + return false; - if (isRBrace) { - RegHi = RegLo; - } else { - if (getParser().parseAbsoluteExpression(RegHi)) - return false; + if (!parseExpr(RegLo)) + return false; - if (getLexer().isNot(AsmToken::RBrac)) - return false; - Parser.Lex(); - } - RegNum = (unsigned) RegLo; - RegWidth = (RegHi - RegLo) + 1; - } - } - } else if (getLexer().is(AsmToken::LBrac)) { - // List of consecutive registers: [s0,s1,s2,s3] - Parser.Lex(); - if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, nullptr)) + if (trySkipToken(AsmToken::Colon)) { + if (!parseExpr(RegHi)) return false; - if (RegWidth != 1) - return false; - RegisterKind RegKind1; - unsigned Reg1, RegNum1, RegWidth1; - do { - if (getLexer().is(AsmToken::Comma)) { - Parser.Lex(); - } else if (getLexer().is(AsmToken::RBrac)) { - Parser.Lex(); - break; - } else if (ParseAMDGPURegister(RegKind1, Reg1, RegNum1, RegWidth1, nullptr)) { - if (RegWidth1 != 1) { - return false; - } - if (RegKind1 != RegKind) { - return false; - } - if (!AddNextRegisterToList(Reg, RegWidth, RegKind1, Reg1, RegNum1)) { - return false; - } - } else { - return false; - } - } while (true); } else { - return false; + RegHi = RegLo; } - switch (RegKind) { - case IS_SPECIAL: + + if (!trySkipToken(AsmToken::RBrac)) + return false; + + if (!isUInt<32>(RegLo) || !isUInt<32>(RegHi) || RegLo > RegHi) + return false; + + Num = static_cast<unsigned>(RegLo); + Width = (RegHi - RegLo) + 1; + return true; +} + +unsigned +AMDGPUAsmParser::ParseSpecialReg(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth) { + assert(isToken(AsmToken::Identifier)); + unsigned Reg = getSpecialRegForName(getTokenStr()); + if (Reg) { RegNum = 0; RegWidth = 1; - break; - case IS_VGPR: - case IS_SGPR: - case IS_AGPR: - case IS_TTMP: - { - unsigned Size = 1; - if (RegKind == IS_SGPR || RegKind == IS_TTMP) { - // SGPR and TTMP registers must be aligned. Max required alignment is 4 dwords. - Size = std::min(RegWidth, 4u); - } - if (RegNum % Size != 0) - return false; - if (DwordRegIndex) { *DwordRegIndex = RegNum; } - RegNum = RegNum / Size; - int RCID = getRegClass(RegKind, RegWidth); - if (RCID == -1) - return false; - const MCRegisterClass RC = TRI->getRegClass(RCID); - if (RegNum >= RC.getNumRegs()) - return false; - Reg = RC.getRegister(RegNum); - break; + RegKind = IS_SPECIAL; + lex(); // skip register name } + return Reg; +} - default: - llvm_unreachable("unexpected register kind"); +unsigned +AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth) { + assert(isToken(AsmToken::Identifier)); + StringRef RegName = getTokenStr(); + + const RegInfo *RI = getRegularRegInfo(RegName); + if (!RI) + return AMDGPU::NoRegister; + lex(); // skip register name + + RegKind = RI->Kind; + StringRef RegSuffix = RegName.substr(RI->Name.size()); + if (!RegSuffix.empty()) { + // Single 32-bit register: vXX. 
+ if (!getRegNum(RegSuffix, RegNum)) + return AMDGPU::NoRegister; + RegWidth = 1; + } else { + // Range of registers: v[XX:YY]. ":YY" is optional. + if (!ParseRegRange(RegNum, RegWidth)) + return AMDGPU::NoRegister; } - if (!subtargetHasRegister(*TRI, Reg)) - return false; - return true; + return getRegularReg(RegKind, RegNum, RegWidth); +} + +unsigned +AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth) { + unsigned Reg = AMDGPU::NoRegister; + + if (!trySkipToken(AsmToken::LBrac)) + return AMDGPU::NoRegister; + + // List of consecutive registers, e.g.: [s0,s1,s2,s3] + + if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) + return AMDGPU::NoRegister; + if (RegWidth != 1) + return AMDGPU::NoRegister; + + for (; trySkipToken(AsmToken::Comma); ) { + RegisterKind NextRegKind; + unsigned NextReg, NextRegNum, NextRegWidth; + + if (!ParseAMDGPURegister(NextRegKind, NextReg, NextRegNum, NextRegWidth)) + return AMDGPU::NoRegister; + if (NextRegWidth != 1) + return AMDGPU::NoRegister; + if (NextRegKind != RegKind) + return AMDGPU::NoRegister; + if (!AddNextRegisterToList(Reg, RegWidth, RegKind, NextReg)) + return AMDGPU::NoRegister; + } + + if (!trySkipToken(AsmToken::RBrac)) + return AMDGPU::NoRegister; + + if (isRegularReg(RegKind)) + Reg = getRegularReg(RegKind, RegNum, RegWidth); + + return Reg; +} + +bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, + unsigned &Reg, + unsigned &RegNum, + unsigned &RegWidth) { + Reg = AMDGPU::NoRegister; + + if (isToken(AsmToken::Identifier)) { + Reg = ParseSpecialReg(RegKind, RegNum, RegWidth); + if (Reg == AMDGPU::NoRegister) + Reg = ParseRegularReg(RegKind, RegNum, RegWidth); + } else { + Reg = ParseRegList(RegKind, RegNum, RegWidth); + } + + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + return Reg != AMDGPU::NoRegister && subtargetHasRegister(*TRI, Reg); } Optional<StringRef> @@ -2241,18 +2318,18 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() { SMLoc StartLoc = Tok.getLoc(); SMLoc EndLoc = Tok.getEndLoc(); RegisterKind RegKind; - unsigned Reg, RegNum, RegWidth, DwordRegIndex; + unsigned Reg, RegNum, RegWidth; - if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, &DwordRegIndex)) { + if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) { //FIXME: improve error messages (bug 41303). 
Error(StartLoc, "not a valid operand."); return nullptr; } if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) { - if (!updateGprCountSymbols(RegKind, DwordRegIndex, RegWidth)) + if (!updateGprCountSymbols(RegKind, RegNum, RegWidth)) return nullptr; } else - KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth); + KernelScope.usesRegister(RegKind, RegNum, RegWidth); return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc); } @@ -2648,7 +2725,6 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { case AMDGPU::VCC_LO: case AMDGPU::VCC_HI: case AMDGPU::M0: - case AMDGPU::SGPR_NULL: return Reg; default: break; @@ -2697,13 +2773,38 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, } } +unsigned AMDGPUAsmParser::getConstantBusLimit(unsigned Opcode) const { + if (!isGFX10()) + return 1; + + switch (Opcode) { + // 64-bit shift instructions can use only one scalar value input + case AMDGPU::V_LSHLREV_B64: + case AMDGPU::V_LSHLREV_B64_gfx10: + case AMDGPU::V_LSHL_B64: + case AMDGPU::V_LSHRREV_B64: + case AMDGPU::V_LSHRREV_B64_gfx10: + case AMDGPU::V_LSHR_B64: + case AMDGPU::V_ASHRREV_I64: + case AMDGPU::V_ASHRREV_I64_gfx10: + case AMDGPU::V_ASHR_I64: + return 1; + default: + return 2; + } +} + bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) { const MCOperand &MO = Inst.getOperand(OpIdx); if (MO.isImm()) { return !isInlineConstant(Inst, OpIdx); + } else if (MO.isReg()) { + auto Reg = MO.getReg(); + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + return isSGPR(mc2PseudoReg(Reg), TRI) && Reg != SGPR_NULL; + } else { + return true; } - return !MO.isReg() || - isSGPR(mc2PseudoReg(MO.getReg()), getContext().getRegisterInfo()); } bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) { @@ -2782,10 +2883,7 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) { } ConstantBusUseCount += NumLiterals; - if (isGFX10()) - return ConstantBusUseCount <= 2; - - return ConstantBusUseCount <= 1; + return ConstantBusUseCount <= getConstantBusLimit(Opcode); } bool AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst) { @@ -3212,6 +3310,7 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const { const int OpIndices[] = { Src0Idx, Src1Idx }; + unsigned NumExprs = 0; unsigned NumLiterals = 0; uint32_t LiteralValue; @@ -3219,19 +3318,21 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const { if (OpIdx == -1) break; const MCOperand &MO = Inst.getOperand(OpIdx); - if (MO.isImm() && - // Exclude special imm operands (like that used by s_set_gpr_idx_on) - AMDGPU::isSISrcOperand(Desc, OpIdx) && - !isInlineConstant(Inst, OpIdx)) { - uint32_t Value = static_cast<uint32_t>(MO.getImm()); - if (NumLiterals == 0 || LiteralValue != Value) { - LiteralValue = Value; - ++NumLiterals; + // Exclude special imm operands (like that used by s_set_gpr_idx_on) + if (AMDGPU::isSISrcOperand(Desc, OpIdx)) { + if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) { + uint32_t Value = static_cast<uint32_t>(MO.getImm()); + if (NumLiterals == 0 || LiteralValue != Value) { + LiteralValue = Value; + ++NumLiterals; + } + } else if (MO.isExpr()) { + ++NumExprs; } } } - return NumLiterals <= 1; + return NumLiterals + NumExprs <= 1; } bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) { @@ -3267,6 +3368,7 @@ bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const { const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; + unsigned NumExprs = 0; unsigned NumLiterals = 0; 
uint32_t LiteralValue; @@ -3274,17 +3376,26 @@ bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const { if (OpIdx == -1) break; const MCOperand &MO = Inst.getOperand(OpIdx); - if (!MO.isImm() || !AMDGPU::isSISrcOperand(Desc, OpIdx)) + if (!MO.isImm() && !MO.isExpr()) + continue; + if (!AMDGPU::isSISrcOperand(Desc, OpIdx)) continue; - if (!isInlineConstant(Inst, OpIdx)) { + if (OpIdx == Src2Idx && (Desc.TSFlags & SIInstrFlags::IsMAI) && + getFeatureBits()[AMDGPU::FeatureMFMAInlineLiteralBug]) + return false; + + if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) { uint32_t Value = static_cast<uint32_t>(MO.getImm()); if (NumLiterals == 0 || LiteralValue != Value) { LiteralValue = Value; ++NumLiterals; } + } else if (MO.isExpr()) { + ++NumExprs; } } + NumLiterals += NumExprs; return !NumLiterals || (NumLiterals == 1 && getFeatureBits()[AMDGPU::FeatureVOP3Literal]); @@ -3607,37 +3718,44 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, Val, ValRange); - UserSGPRCount += 4; + if (Val) + UserSGPRCount += 4; } else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val, ValRange); - UserSGPRCount += 2; + if (Val) + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_queue_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val, ValRange); - UserSGPRCount += 2; + if (Val) + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_kernarg_segment_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR, Val, ValRange); - UserSGPRCount += 2; + if (Val) + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_dispatch_id") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val, ValRange); - UserSGPRCount += 2; + if (Val) + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val, ValRange); - UserSGPRCount += 2; + if (Val) + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_private_segment_size") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, Val, ValRange); - UserSGPRCount += 1; + if (Val) + UserSGPRCount += 1; } else if (ID == ".amdhsa_wavefront_size32") { if (IVersion.Major < 10) return getParser().Error(IDRange.Start, "directive requires gfx10+", @@ -5225,6 +5343,23 @@ AMDGPUAsmParser::parseExpr(int64_t &Imm) { } bool +AMDGPUAsmParser::parseExpr(OperandVector &Operands) { + SMLoc S = getLoc(); + + const MCExpr *Expr; + if (Parser.parseExpression(Expr)) + return false; + + int64_t IntVal; + if (Expr->evaluateAsAbsolute(IntVal)) { + Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S)); + } else { + Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S)); + } + return true; +} + +bool AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) { if (isToken(AsmToken::String)) { Val = getToken().getStringContents(); @@ -5605,25 +5740,29 @@ bool AMDGPUOperand::isGPRIdxMode() const { OperandMatchResultTy AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { - SMLoc S = Parser.getTok().getLoc(); - switch (getLexer().getKind()) { - default: return MatchOperand_ParseFail; - case AsmToken::Integer: { - int64_t Imm; - if (getParser().parseAbsoluteExpression(Imm)) 
- return MatchOperand_ParseFail; - Operands.push_back(AMDGPUOperand::CreateImm(this, Imm, S)); - return MatchOperand_Success; - } + // Make sure we are not parsing something + // that looks like a label or an expression but is not. + // This will improve error messages. + if (isRegister() || isModifier()) + return MatchOperand_NoMatch; - case AsmToken::Identifier: - Operands.push_back(AMDGPUOperand::CreateExpr(this, - MCSymbolRefExpr::create(getContext().getOrCreateSymbol( - Parser.getTok().getString()), getContext()), S)); - Parser.Lex(); - return MatchOperand_Success; + if (parseExpr(Operands)) { + + AMDGPUOperand &Opr = ((AMDGPUOperand &)*Operands[Operands.size() - 1]); + assert(Opr.isImm() || Opr.isExpr()); + SMLoc Loc = Opr.getStartLoc(); + + // Currently we do not support arbitrary expressions as branch targets. + // Only labels and absolute expressions are accepted. + if (Opr.isExpr() && !Opr.isSymbolRefExpr()) { + Error(Loc, "expected an absolute expression or a label"); + } else if (Opr.isImm() && !Opr.isS16Imm()) { + Error(Loc, "expected a 16-bit signed jump offset"); + } } + + return MatchOperand_Success; // avoid excessive error messages } //===----------------------------------------------------------------------===// @@ -5908,6 +6047,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"format", AMDGPUOperand::ImmTyFORMAT, false, nullptr}, {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, + {"swz", AMDGPUOperand::ImmTySWZ, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, {"d16", AMDGPUOperand::ImmTyD16, true, nullptr}, {"high", AMDGPUOperand::ImmTyHigh, true, nullptr}, @@ -5941,8 +6081,6 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { }; OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) { - unsigned size = Operands.size(); - assert(size > 0); OperandMatchResultTy res = parseOptionalOpr(Operands); @@ -5957,17 +6095,13 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operan // to make sure autogenerated parser of custom operands never hit hardcoded // mandatory operands. - if (size == 1 || ((AMDGPUOperand &)*Operands[size - 1]).isRegKind()) { - - // We have parsed the first optional operand. - // Parse as many operands as necessary to skip all mandatory operands. 
+ for (unsigned i = 0; i < MAX_OPR_LOOKAHEAD; ++i) { + if (res != MatchOperand_Success || + isToken(AsmToken::EndOfStatement)) + break; - for (unsigned i = 0; i < MAX_OPR_LOOKAHEAD; ++i) { - if (res != MatchOperand_Success || - getLexer().is(AsmToken::EndOfStatement)) break; - if (getLexer().is(AsmToken::Comma)) Parser.Lex(); - res = parseOptionalOpr(Operands); - } + trySkipToken(AsmToken::Comma); + res = parseOptionalOpr(Operands); } return res; @@ -6682,7 +6816,11 @@ void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) { } void AMDGPUAsmParser::cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands) { - cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, true); + cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, true, true); +} + +void AMDGPUAsmParser::cvtSdwaVOP2e(MCInst &Inst, const OperandVector &Operands) { + cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, false, true); } void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) { @@ -6690,11 +6828,14 @@ void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) { } void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, - uint64_t BasicInstType, bool skipVcc) { + uint64_t BasicInstType, + bool SkipDstVcc, + bool SkipSrcVcc) { using namespace llvm::AMDGPU::SDWA; OptionalImmIndexMap OptionalIdx; - bool skippedVcc = false; + bool SkipVcc = SkipDstVcc || SkipSrcVcc; + bool SkippedVcc = false; unsigned I = 1; const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); @@ -6704,19 +6845,21 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); - if (skipVcc && !skippedVcc && Op.isReg() && + if (SkipVcc && !SkippedVcc && Op.isReg() && (Op.getReg() == AMDGPU::VCC || Op.getReg() == AMDGPU::VCC_LO)) { // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst. // Skip it if it's 2nd (e.g. v_add_i32_sdwa v1, vcc, v2, v3) // or 4th (v_addc_u32_sdwa v1, vcc, v2, v3, vcc) operand. // Skip VCC only if we didn't skip it on previous iteration. + // Note that src0 and src1 occupy 2 slots each because of modifiers. if (BasicInstType == SIInstrFlags::VOP2 && - (Inst.getNumOperands() == 1 || Inst.getNumOperands() == 5)) { - skippedVcc = true; + ((SkipDstVcc && Inst.getNumOperands() == 1) || + (SkipSrcVcc && Inst.getNumOperands() == 5))) { + SkippedVcc = true; continue; } else if (BasicInstType == SIInstrFlags::VOPC && Inst.getNumOperands() == 0) { - skippedVcc = true; + SkippedVcc = true; continue; } } @@ -6728,7 +6871,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, } else { llvm_unreachable("Invalid operand type"); } - skippedVcc = false; + SkippedVcc = false; } if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx10 && @@ -6849,6 +6992,14 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, return Operand.isInterpAttr() ? Match_Success : Match_InvalidOperand; case MCK_AttrChan: return Operand.isAttrChan() ? Match_Success : Match_InvalidOperand; + case MCK_SReg_64: + case MCK_SReg_64_XEXEC: + // Null is defined as a 32-bit register but + // it should also be enabled with 64-bit operands. + // The following code enables it for SReg_64 operands + // used as source and destination. Remaining source + // operands are handled in isInlinableImm. + return Operand.isNull() ? 
Match_Success : Match_InvalidOperand; default: return Match_InvalidOperand; } diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index 62a19d848af2..1b12550aed88 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -7,13 +7,13 @@ //===----------------------------------------------------------------------===// def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">; -def MUBUFAddr64 : ComplexPattern<i64, 8, "SelectMUBUFAddr64">; +def MUBUFAddr64 : ComplexPattern<i64, 9, "SelectMUBUFAddr64">; def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">; def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>; def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>; -def MUBUFOffset : ComplexPattern<i64, 7, "SelectMUBUFOffset">; +def MUBUFOffset : ComplexPattern<i64, 8, "SelectMUBUFOffset">; def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">; def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; @@ -54,6 +54,17 @@ class MTBUFAddr64Table <bit is_addr64, string Name> { // MTBUF classes //===----------------------------------------------------------------------===// +class MTBUFGetBaseOpcode<string Op> { + string ret = !subst("FORMAT_XY", "FORMAT_X", + !subst("FORMAT_XYZ", "FORMAT_X", + !subst("FORMAT_XYZW", "FORMAT_X", Op))); +} + +class getMTBUFElements<string Op> { + int ret = 1; +} + + class MTBUF_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> : InstSI<outs, ins, "", pattern>, @@ -67,6 +78,9 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins, string Mnemonic = opName; string AsmOperands = asmOps; + Instruction Opcode = !cast<Instruction>(NAME); + Instruction BaseOpcode = !cast<Instruction>(MTBUFGetBaseOpcode<NAME>.ret); + let VM_CNT = 1; let EXP_CNT = 1; let MTBUF = 1; @@ -90,6 +104,7 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins, bits<1> has_offset = 1; bits<1> has_slc = 1; bits<1> has_tfe = 1; + bits<4> elements = 0; } class MTBUF_Real <MTBUF_Pseudo ps> : @@ -126,17 +141,17 @@ class getMTBUFInsDA<list<RegisterClass> vdataList, RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); dag InsNoData = !if(!empty(vaddrList), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc), + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz), (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc) + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz) ); dag InsData = !if(!empty(vaddrList), (ins vdataClass:$vdata, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe, DLC:$dlc), + SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz), (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe, DLC:$dlc) + SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz) ); dag ret = !if(!empty(vdataList), InsNoData, InsData); } @@ -181,51 +196,54 @@ class MTBUF_SetupAddr<int addrKind> { class MTBUF_Load_Pseudo <string opName, int addrKind, RegisterClass vdataClass, + int elems, list<dag> pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind> : MTBUF_Pseudo<opName, (outs vdataClass:$vdata), getMTBUFIns<addrKindCopy>.ret, - " $vdata, " # 
getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc", + " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz", pattern>, MTBUF_SetupAddr<addrKindCopy> { let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; let mayLoad = 1; let mayStore = 0; + let elements = elems; } multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, - ValueType load_vt = i32, + int elems, ValueType load_vt = i32, SDPatternOperator ld = null_frag> { - def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$format, - i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)))]>, + i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>, MTBUFAddr64Table<0, NAME>; - def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems, [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, - i8:$format, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)))]>, + i8:$format, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>, MTBUFAddr64Table<1, NAME>; - def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; - def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; - def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>; + def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>; + def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>; let DisableWQM = 1 in { - def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>; - def _OFFEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; - def _IDXEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; - def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>; + def _OFFEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>; + def _IDXEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>; + def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>; } } class MTBUF_Store_Pseudo <string opName, int addrKind, RegisterClass vdataClass, + int elems, list<dag> pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind, @@ -233,39 +251,40 @@ class MTBUF_Store_Pseudo <string opName, : MTBUF_Pseudo<opName, (outs), getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret, - " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc", + " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz", pattern>, MTBUF_SetupAddr<addrKindCopy> { let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; let mayLoad = 0; let mayStore = 1; + let elements = elems; } multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, - ValueType store_vt = i32, + int elems, ValueType store_vt = i32, SDPatternOperator st = null_frag> { - def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$format, i1:$glc, - i1:$slc, i1:$tfe, i1:$dlc))]>, + i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, 
MTBUFAddr64Table<0, NAME>; - def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems, [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i8:$format, i1:$glc, - i1:$slc, i1:$tfe, i1:$dlc))]>, + i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MTBUFAddr64Table<1, NAME>; - def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; - def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; - def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>; + def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>; + def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>; let DisableWQM = 1 in { - def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>; - def _OFFEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; - def _IDXEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; - def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>; + def _OFFEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>; + def _IDXEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>; + def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>; } } @@ -320,7 +339,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins, bits<1> has_offset = 1; bits<1> has_slc = 1; bits<1> has_tfe = 1; - bits<4> dwords = 0; + bits<4> elements = 0; } class MUBUF_Real <MUBUF_Pseudo ps> : @@ -393,18 +412,30 @@ class getMUBUFInsDA<list<RegisterClass> vdataList, ); dag ret = !con( !if(!empty(vdataList), InsNoData, InsData), - !if(isLds, (ins DLC:$dlc), (ins TFE:$tfe, DLC:$dlc)) + !if(isLds, (ins DLC:$dlc, SWZ:$swz), (ins TFE:$tfe, DLC:$dlc,SWZ:$swz)) ); } -class getMUBUFDwords<RegisterClass regClass> { - string regClassAsInt = !cast<string>(regClass); +class getMUBUFElements<ValueType vt> { + // eq does not support ValueType for some reason. 
+ string vtAsStr = !cast<string>(vt); + int ret = - !if(!eq(regClassAsInt, !cast<string>(VGPR_32)), 1, - !if(!eq(regClassAsInt, !cast<string>(VReg_64)), 2, - !if(!eq(regClassAsInt, !cast<string>(VReg_96)), 3, - !if(!eq(regClassAsInt, !cast<string>(VReg_128)), 4, - 0)))); + !if(!eq(vtAsStr, "f16"), 1, + !if(!eq(vtAsStr, "v2f16"), 2, + !if(!eq(vtAsStr, "v3f16"), 3, + !if(!eq(vtAsStr, "v4f16"), 4, + !if(!eq(vt.Size, 32), 1, + !if(!eq(vt.Size, 64), 2, + !if(!eq(vt.Size, 96), 3, + !if(!eq(vt.Size, 128), 4, 0) + ) + ) + ) + ) + ) + ) + ); } class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[], bit isLds = 0> { @@ -442,18 +473,18 @@ class MUBUF_SetupAddr<int addrKind> { class MUBUF_Load_Pseudo <string opName, int addrKind, - RegisterClass vdataClass, + ValueType vdata_vt, bit HasTiedDest = 0, bit isLds = 0, list<dag> pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind> : MUBUF_Pseudo<opName, - (outs vdataClass:$vdata), + (outs getVregSrcForVT<vdata_vt>.ret:$vdata), !con(getMUBUFIns<addrKindCopy, [], isLds>.ret, - !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))), + !if(HasTiedDest, (ins getVregSrcForVT<vdata_vt>.ret:$vdata_in), (ins))), " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc" # - !if(isLds, " lds", "$tfe") # "$dlc", + !if(isLds, " lds", "$tfe") # "$dlc" # "$swz", pattern>, MUBUF_SetupAddr<addrKindCopy> { let PseudoInstr = opName # !if(isLds, "_lds", "") # @@ -467,19 +498,19 @@ class MUBUF_Load_Pseudo <string opName, let Uses = !if(isLds, [EXEC, M0], [EXEC]); let has_tfe = !if(isLds, 0, 1); let lds = isLds; - let dwords = getMUBUFDwords<vdataClass>.ret; + let elements = getMUBUFElements<vdata_vt>.ret; } class MUBUF_Offset_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat < - (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), - (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)) + (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), + (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)) >; class MUBUF_Addr64_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat < - (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), - (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)) + (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), + (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)) >; multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> { @@ -490,89 +521,87 @@ multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPa // FIXME: tfe can't be an operand because it requires a separate // opcode because it needs an N+1 register class dest register. 
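The getMUBUFElements change above is the crux of this migration from register classes to value types: the element count is now derived from the value type itself, so sub-dword f16 vectors report their lane count rather than the number of dwords they occupy. A rough C++ rendering of that TableGen logic, assuming a hypothetical helper name:

// Mirrors getMUBUFElements: 16-bit float vectors count lanes directly
// (f16, v2f16, v3f16, v4f16 -> 1..4); all other types fall back to the
// number of 32-bit dwords they span (32/64/96/128 bits -> 1..4).
static unsigned getMUBUFElementCount(unsigned SizeInBits, bool IsF16Vector,
                                     unsigned NumF16Lanes) {
  if (IsF16Vector)
    return NumF16Lanes;
  return SizeInBits / 32;
}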
-multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
+multiclass MUBUF_Pseudo_Loads<string opName, ValueType load_vt = i32,
                               SDPatternOperator ld = null_frag,
                               bit TiedDest = 0,
                               bit isLds = 0> {

-  def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest, isLds>,
+  def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds>,
    MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>;

-  def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, TiedDest, isLds>,
+  def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, load_vt, TiedDest, isLds>,
    MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>;

-  def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest, isLds>;
-  def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest, isLds>;
-  def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest, isLds>;
+  def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds>;
+  def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds>;
+  def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds>;

  let DisableWQM = 1 in {
-    def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest, isLds>;
-    def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest, isLds>;
-    def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest, isLds>;
-    def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest, isLds>;
+    def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds>;
+    def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds>;
+    def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds>;
+    def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds>;
  }
}

-multiclass MUBUF_Pseudo_Loads_Lds<string opName, RegisterClass vdataClass,
-                                  ValueType load_vt = i32,
+multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32,
                                   SDPatternOperator ld_nolds = null_frag,
                                   SDPatternOperator ld_lds = null_frag> {
-  defm NAME : MUBUF_Pseudo_Loads<opName, vdataClass, load_vt, ld_nolds>;
-  defm _LDS : MUBUF_Pseudo_Loads<opName, vdataClass, load_vt, ld_lds, 0, 1>;
+  defm NAME : MUBUF_Pseudo_Loads<opName, load_vt, ld_nolds>;
+  defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, ld_lds, 0, 1>;
}

class MUBUF_Store_Pseudo <string opName,
                          int addrKind,
-                          RegisterClass vdataClass,
+                          ValueType store_vt,
                          list<dag> pattern=[],
                          // Workaround bug bz30254
-                          int addrKindCopy = addrKind,
-                          RegisterClass vdataClassCopy = vdataClass>
+                          int addrKindCopy = addrKind>
  : MUBUF_Pseudo<opName,
                 (outs),
-                 getMUBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
-                 " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc",
+                 getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret]>.ret,
+                 " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz",
                 pattern>,
    MUBUF_SetupAddr<addrKindCopy> {
  let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
  let mayLoad = 0;
  let mayStore = 1;
  let maybeAtomic = 1;
-  let dwords = getMUBUFDwords<vdataClass>.ret;
+  let elements = getMUBUFElements<store_vt>.ret;
}

-multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
+multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32,
                               SDPatternOperator st = null_frag> {

-  def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+  def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt,
    [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
-                                      i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))]>,
+                                      i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
    MUBUFAddr64Table<0, NAME>;

-  def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+  def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, store_vt,
    [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
-                                      i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))]>,
+                                      i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
    MUBUFAddr64Table<1, NAME>;

-  def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
-  def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
-  def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+  def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt>;
+  def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt>;
+  def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt>;

  let DisableWQM = 1 in {
-    def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
-    def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
-    def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
-    def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+    def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt>;
+    def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt>;
+    def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt>;
+    def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt>;
  }
}

class MUBUF_Pseudo_Store_Lds<string opName>
  : MUBUF_Pseudo<opName,
      (outs),
-      (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc),
-      " $srsrc, $soffset$offset lds$glc$slc"> {
+      (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc, SWZ:$swz),
+      " $srsrc, $soffset$offset lds$glc$slc$swz"> {
  let mayLoad = 0;
  let mayStore = 1;
  let maybeAtomic = 1;
@@ -686,7 +715,7 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
                                        RegisterClass vdataClass,
                                        ValueType vdataType,
                                        SDPatternOperator atomic,
-                                        bit isFP = getIsFP<vdataType>.ret> {
+                                        bit isFP = isFloatType<vdataType>.ret> {
  let FPAtomic = isFP in
  def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>,
                MUBUFAddr64Table <0, NAME>;
@@ -710,7 +739,7 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
                                     RegisterClass vdataClass,
                                     ValueType vdataType,
                                     SDPatternOperator atomic,
-                                     bit isFP = getIsFP<vdataType>.ret> {
+                                     bit isFP = isFloatType<vdataType>.ret> {
  let FPAtomic = isFP in
  def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
    [(set vdataType:$vdata,
@@ -748,107 +777,107 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
//===----------------------------------------------------------------------===//

defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads_Lds <
-  "buffer_load_format_x", VGPR_32
+  "buffer_load_format_x", f32
>;
defm BUFFER_LOAD_FORMAT_XY : MUBUF_Pseudo_Loads <
-  "buffer_load_format_xy", VReg_64
+  "buffer_load_format_xy", v2f32
>;
defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Pseudo_Loads <
-  "buffer_load_format_xyz", VReg_96
+  "buffer_load_format_xyz", v3f32
>;
defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Pseudo_Loads <
-  "buffer_load_format_xyzw", VReg_128
+  "buffer_load_format_xyzw", v4f32
>;
defm BUFFER_STORE_FORMAT_X : MUBUF_Pseudo_Stores <
-  "buffer_store_format_x", VGPR_32
+  "buffer_store_format_x", f32
>;
defm BUFFER_STORE_FORMAT_XY : MUBUF_Pseudo_Stores <
-  "buffer_store_format_xy", VReg_64
+  "buffer_store_format_xy", v2f32
>;
defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Pseudo_Stores <
-  "buffer_store_format_xyz", VReg_96
+  "buffer_store_format_xyz", v3f32
>;
defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Pseudo_Stores <
-  "buffer_store_format_xyzw", VReg_128
+  "buffer_store_format_xyzw", v4f32
>;

let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in {
defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Loads <
-  "buffer_load_format_d16_x", VGPR_32
+  "buffer_load_format_d16_x", i32
>;
defm BUFFER_LOAD_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Loads <
-  "buffer_load_format_d16_xy", VReg_64
+  "buffer_load_format_d16_xy", v2i32
>;
defm BUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Loads <
-  "buffer_load_format_d16_xyz", VReg_96
+  "buffer_load_format_d16_xyz", v3i32
>;
defm BUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Loads <
-  "buffer_load_format_d16_xyzw", VReg_128
+  "buffer_load_format_d16_xyzw", v4i32
>;
defm BUFFER_STORE_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Stores <
-  "buffer_store_format_d16_x", VGPR_32
+  "buffer_store_format_d16_x", i32
>;
defm BUFFER_STORE_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Stores <
-  "buffer_store_format_d16_xy", VReg_64
+  "buffer_store_format_d16_xy", v2i32
>;
defm BUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Stores <
-  "buffer_store_format_d16_xyz", VReg_96
+  "buffer_store_format_d16_xyz", v3i32
>;
defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Stores <
-  "buffer_store_format_d16_xyzw", VReg_128
+  "buffer_store_format_d16_xyzw", v4i32
>;
} // End HasUnpackedD16VMem.

let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in {
defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads <
-  "buffer_load_format_d16_x", VGPR_32
+  "buffer_load_format_d16_x", f16
>;
defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Pseudo_Loads <
-  "buffer_load_format_d16_xy", VGPR_32
+  "buffer_load_format_d16_xy", v2f16
>;
defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Pseudo_Loads <
-  "buffer_load_format_d16_xyz", VReg_64
+  "buffer_load_format_d16_xyz", v3f16
>;
defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Pseudo_Loads <
-  "buffer_load_format_d16_xyzw", VReg_64
+  "buffer_load_format_d16_xyzw", v4f16
>;
defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Pseudo_Stores <
-  "buffer_store_format_d16_x", VGPR_32
+  "buffer_store_format_d16_x", f16
>;
defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Pseudo_Stores <
-  "buffer_store_format_d16_xy", VGPR_32
+  "buffer_store_format_d16_xy", v2f16
>;
defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Pseudo_Stores <
-  "buffer_store_format_d16_xyz", VReg_64
+  "buffer_store_format_d16_xyz", v3f16
>;
defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Pseudo_Stores <
-  "buffer_store_format_d16_xyzw", VReg_64
+  "buffer_store_format_d16_xyzw", v4f16
>;
} // End HasPackedD16VMem.
defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads_Lds <
-  "buffer_load_ubyte", VGPR_32, i32
+  "buffer_load_ubyte", i32
>;
defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads_Lds <
-  "buffer_load_sbyte", VGPR_32, i32
+  "buffer_load_sbyte", i32
>;
defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads_Lds <
-  "buffer_load_ushort", VGPR_32, i32
+  "buffer_load_ushort", i32
>;
defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads_Lds <
-  "buffer_load_sshort", VGPR_32, i32
+  "buffer_load_sshort", i32
>;
defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads_Lds <
-  "buffer_load_dword", VGPR_32, i32
+  "buffer_load_dword", i32
>;
defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads <
-  "buffer_load_dwordx2", VReg_64, v2i32
+  "buffer_load_dwordx2", v2i32
>;
defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads <
-  "buffer_load_dwordx3", VReg_96, v3i32
+  "buffer_load_dwordx3", v3i32
>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads <
-  "buffer_load_dwordx4", VReg_128, v4i32
+  "buffer_load_dwordx4", v4i32
>;

defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>;
@@ -867,111 +896,111 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>;
// in at least GFX8+ chips. See Bug 37653.
let SubtargetPredicate = isGFX8GFX9 in {
defm BUFFER_LOAD_DWORDX2_LDS : MUBUF_Pseudo_Loads <
-  "buffer_load_dwordx2", VReg_64, v2i32, null_frag, 0, 1
+  "buffer_load_dwordx2", v2i32, null_frag, 0, 1
>;
defm BUFFER_LOAD_DWORDX3_LDS : MUBUF_Pseudo_Loads <
-  "buffer_load_dwordx3", VReg_96, untyped, null_frag, 0, 1
+  "buffer_load_dwordx3", v3i32, null_frag, 0, 1
>;
defm BUFFER_LOAD_DWORDX4_LDS : MUBUF_Pseudo_Loads <
-  "buffer_load_dwordx4", VReg_128, v4i32, null_frag, 0, 1
+  "buffer_load_dwordx4", v4i32, null_frag, 0, 1
>;
}

defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores <
-  "buffer_store_byte", VGPR_32, i32, truncstorei8_global
+  "buffer_store_byte", i32, truncstorei8_global
>;
defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores <
-  "buffer_store_short", VGPR_32, i32, truncstorei16_global
+  "buffer_store_short", i32, truncstorei16_global
>;
defm BUFFER_STORE_DWORD : MUBUF_Pseudo_Stores <
-  "buffer_store_dword", VGPR_32, i32, store_global
+  "buffer_store_dword", i32, store_global
>;
defm BUFFER_STORE_DWORDX2 : MUBUF_Pseudo_Stores <
-  "buffer_store_dwordx2", VReg_64, v2i32, store_global
+  "buffer_store_dwordx2", v2i32, store_global
>;
defm BUFFER_STORE_DWORDX3 : MUBUF_Pseudo_Stores <
-  "buffer_store_dwordx3", VReg_96, v3i32, store_global
+  "buffer_store_dwordx3", v3i32, store_global
>;
defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores <
-  "buffer_store_dwordx4", VReg_128, v4i32, store_global
+  "buffer_store_dwordx4", v4i32, store_global
>;

defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
+  "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global_32
>;
defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Pseudo_Atomics <
  "buffer_atomic_cmpswap", VReg_64, v2i32, null_frag
>;
defm BUFFER_ATOMIC_ADD : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_add", VGPR_32, i32, atomic_add_global
+  "buffer_atomic_add", VGPR_32, i32, atomic_load_add_global_32
>;
defm BUFFER_ATOMIC_SUB : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global
+  "buffer_atomic_sub", VGPR_32, i32, atomic_load_sub_global_32
>;
defm BUFFER_ATOMIC_SMIN : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_smin", VGPR_32, i32, atomic_min_global
+  "buffer_atomic_smin", VGPR_32, i32, atomic_load_min_global_32
>;
defm BUFFER_ATOMIC_UMIN : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global
+  "buffer_atomic_umin", VGPR_32, i32, atomic_load_umin_global_32
>;
defm BUFFER_ATOMIC_SMAX : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_smax", VGPR_32, i32, atomic_max_global
+  "buffer_atomic_smax", VGPR_32, i32, atomic_load_max_global_32
>;
defm BUFFER_ATOMIC_UMAX : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global
+  "buffer_atomic_umax", VGPR_32, i32, atomic_load_umax_global_32
>;
defm BUFFER_ATOMIC_AND : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_and", VGPR_32, i32, atomic_and_global
+  "buffer_atomic_and", VGPR_32, i32, atomic_load_and_global_32
>;
defm BUFFER_ATOMIC_OR : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_or", VGPR_32, i32, atomic_or_global
+  "buffer_atomic_or", VGPR_32, i32, atomic_load_or_global_32
>;
defm BUFFER_ATOMIC_XOR : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global
+  "buffer_atomic_xor", VGPR_32, i32, atomic_load_xor_global_32
>;
defm BUFFER_ATOMIC_INC : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global
+  "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global_32
>;
defm BUFFER_ATOMIC_DEC : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global
+  "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global_32
>;
defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global
+  "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global_64
>;
defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Pseudo_Atomics <
  "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag
>;
defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_add_x2", VReg_64, i64, atomic_add_global
+  "buffer_atomic_add_x2", VReg_64, i64, atomic_load_add_global_64
>;
defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_sub_x2", VReg_64, i64, atomic_sub_global
+  "buffer_atomic_sub_x2", VReg_64, i64, atomic_load_sub_global_64
>;
defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_smin_x2", VReg_64, i64, atomic_min_global
+  "buffer_atomic_smin_x2", VReg_64, i64, atomic_load_min_global_64
>;
defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_umin_x2", VReg_64, i64, atomic_umin_global
+  "buffer_atomic_umin_x2", VReg_64, i64, atomic_load_umin_global_64
>;
defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_smax_x2", VReg_64, i64, atomic_max_global
+  "buffer_atomic_smax_x2", VReg_64, i64, atomic_load_max_global_64
>;
defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_umax_x2", VReg_64, i64, atomic_umax_global
+  "buffer_atomic_umax_x2", VReg_64, i64, atomic_load_umax_global_64
>;
defm BUFFER_ATOMIC_AND_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_and_x2", VReg_64, i64, atomic_and_global
+  "buffer_atomic_and_x2", VReg_64, i64, atomic_load_and_global_64
>;
defm BUFFER_ATOMIC_OR_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_or_x2", VReg_64, i64, atomic_or_global
+  "buffer_atomic_or_x2", VReg_64, i64, atomic_load_or_global_64
>;
defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_xor_x2", VReg_64, i64, atomic_xor_global
+  "buffer_atomic_xor_x2", VReg_64, i64, atomic_load_xor_global_64
>;
defm BUFFER_ATOMIC_INC_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global
+  "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global_64
>;
defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global
+  "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global_64
>;
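All of the atomic defms above now reference PatFrags whose names spell out both the operation and the memory width (_32 or _64). A hypothetical additional instantiation following the same convention would look like this sketch (BUFFER_ATOMIC_EXAMPLE is not a real opcode):

defm BUFFER_ATOMIC_EXAMPLE : MUBUF_Pseudo_Atomics <
  "buffer_atomic_example", VGPR_32, i32, atomic_load_add_global_32
>;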
let SubtargetPredicate = isGFX8GFX9 in {
@@ -981,58 +1010,75 @@ def BUFFER_STORE_LDS_DWORD :
  MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">;

let SubtargetPredicate = isGFX6 in { // isn't on CI & VI
/*
defm BUFFER_ATOMIC_RSUB : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub">;
-defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap">;
-defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <"buffer_atomic_fmin">;
-defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <"buffer_atomic_fmax">;
defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub_x2">;
-defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap_x2">;
-defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fmin_x2">;
-defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fmax_x2">;
*/

def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc",
                                          int_amdgcn_buffer_wbinvl1_sc>;
}

+let SubtargetPredicate = isGFX6GFX7GFX10 in {
+
+defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_fcmpswap", VReg_64, v2f32, null_frag
+>;
+defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_fmin", VGPR_32, f32, null_frag
+>;
+defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_fmax", VGPR_32, f32, null_frag
+>;
+defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_fcmpswap_x2", VReg_128, v2f64, null_frag
+>;
+defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_fmin_x2", VReg_64, f64, null_frag
+>;
+defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_fmax_x2", VReg_64, f64, null_frag
+>;
+
+}
+
let SubtargetPredicate = HasD16LoadStore in {

defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads <
-  "buffer_load_ubyte_d16", VGPR_32, i32, null_frag, 1
+  "buffer_load_ubyte_d16", i32, null_frag, 1
>;
defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Pseudo_Loads <
-  "buffer_load_ubyte_d16_hi", VGPR_32, i32, null_frag, 1
+  "buffer_load_ubyte_d16_hi", i32, null_frag, 1
>;
defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads <
-  "buffer_load_sbyte_d16", VGPR_32, i32, null_frag, 1
+  "buffer_load_sbyte_d16", i32, null_frag, 1
>;
defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Pseudo_Loads <
-  "buffer_load_sbyte_d16_hi", VGPR_32, i32, null_frag, 1
+  "buffer_load_sbyte_d16_hi", i32, null_frag, 1
>;
defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseudo_Loads <
-  "buffer_load_short_d16", VGPR_32, i32, null_frag, 1
+  "buffer_load_short_d16", i32, null_frag, 1
>;
defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Pseudo_Loads <
-  "buffer_load_short_d16_hi", VGPR_32, i32, null_frag, 1
+  "buffer_load_short_d16_hi", i32, null_frag, 1
>;
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Pseudo_Stores <
-  "buffer_store_byte_d16_hi", VGPR_32, i32
+  "buffer_store_byte_d16_hi", i32
>;
defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores <
-  "buffer_store_short_d16_hi", VGPR_32, i32
+  "buffer_store_short_d16_hi", i32
>;
defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads <
-  "buffer_load_format_d16_hi_x", VGPR_32
+  "buffer_load_format_d16_hi_x", i32
>;
defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores <
-  "buffer_store_format_d16_hi_x", VGPR_32
+  "buffer_store_format_d16_hi_x", i32
>;

} // End HasD16LoadStore
@@ -1043,10 +1089,10 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
let SubtargetPredicate = HasAtomicFaddInsts in {

defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN <
-  "buffer_atomic_add_f32", VGPR_32, f32, atomic_add_global
+  "buffer_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret
>;
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
-  "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global
+  "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_pk_fadd_global_noret
>;

} // End SubtargetPredicate = HasAtomicFaddInsts
@@ -1055,35 +1101,35 @@
// MTBUF Instructions
//===----------------------------------------------------------------------===//

-defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>;
-defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>;
-defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96>;
-defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>;
-defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>;
-defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>;
-defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96>;
-defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>;
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32, 1>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64, 2>;
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96, 3>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128, 4>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32, 1>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64, 2>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96, 3>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128, 4>;

let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in {
-  defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>;
-  defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64>;
-  defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96>;
-  defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128>;
-  defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>;
-  defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64>;
-  defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96>;
-  defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128>;
+  defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>;
+  defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64, 2>;
+  defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96, 3>;
+  defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128, 4>;
+  defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>;
+  defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64, 2>;
+  defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96, 3>;
+  defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128, 4>;
} // End HasUnpackedD16VMem.

let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in {
-  defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>;
-  defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32>;
-  defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64>;
-  defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64>;
-  defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>;
-  defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32>;
-  defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64>;
-  defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>;
+  defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>;
+  defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32, 2>;
+  defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64, 3>;
+  defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64, 4>;
+  defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>;
+  defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32, 2>;
+  defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64, 3>;
+  defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64, 4>;
} // End HasPackedD16VMem.

let SubtargetPredicate = isGFX7Plus in {
@@ -1118,6 +1164,10 @@ def extract_dlc : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8);
}]>;

+def extract_swz : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8);
+}]>;
+
//===----------------------------------------------------------------------===//
// buffer_load/store_format patterns
//===----------------------------------------------------------------------===//
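extract_swz completes the set of SDNodeXForms that unpack the auxiliary (cachepolicy) immediate, taking swz from bit 3 just as extract_dlc takes dlc from bit 2. Assuming glc and slc occupy bits 0 and 1 under the same scheme, a glc extractor (the real one is defined earlier in this file and does not appear in these hunks) would look like this sketch:

def extract_glc_sketch : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i8);
}]>;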
@@ -1125,33 +1175,37 @@ def extract_dlc : SDNodeXForm<imm, [{
multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
                                  string opcode> {
  def : GCNPat<
-    (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
-              imm:$cachepolicy, 0)),
+    (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+              timm:$auxiliary, 0)),
    (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
-      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+      (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+      (extract_swz $auxiliary))
  >;

  def : GCNPat<
-    (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
-              imm:$cachepolicy, 0)),
+    (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+              timm:$auxiliary, 0)),
    (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
-      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+      (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+      (extract_swz $auxiliary))
  >;

  def : GCNPat<
-    (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
-              imm:$cachepolicy, imm)),
+    (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+              timm:$auxiliary, timm)),
    (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
-      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+      (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+      (extract_swz $auxiliary))
  >;

  def : GCNPat<
-    (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
-              imm:$cachepolicy, imm)),
+    (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
+              timm:$auxiliary, timm)),
    (!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
      (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
      $rsrc, $soffset, (as_i16imm $offset),
-      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+      (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+      (extract_swz $auxiliary))
  >;
}

@@ -1182,8 +1236,12 @@ let SubtargetPredicate = HasPackedD16VMem in {
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i16, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f16, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i16, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f16, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
@@ -1196,36 +1254,40 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, i32, "BUFFER_LOAD_USHORT">;
multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
                                   string opcode> {
  def : GCNPat<
-    (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
-          imm:$cachepolicy, 0),
+    (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+          timm:$auxiliary, 0),
    (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
-      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+      (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+      (extract_swz $auxiliary))
  >;

  def : GCNPat<
-    (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
-          imm:$cachepolicy, 0),
+    (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+          timm:$auxiliary, 0),
    (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
-      (as_i16imm $offset), (extract_glc $cachepolicy),
-      (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+      (as_i16imm $offset), (extract_glc $auxiliary),
+      (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+      (extract_swz $auxiliary))
  >;

  def : GCNPat<
-    (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
-          imm:$cachepolicy, imm),
+    (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+          timm:$auxiliary, timm),
    (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
-      (as_i16imm $offset), (extract_glc $cachepolicy),
-      (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+      (as_i16imm $offset), (extract_glc $auxiliary),
+      (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+      (extract_swz $auxiliary))
  >;

  def : GCNPat<
-    (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
-          imm:$cachepolicy, imm),
+    (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
+          timm:$auxiliary, timm),
    (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
      $vdata,
      (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
-      $rsrc, $soffset, (as_i16imm $offset), (extract_glc $cachepolicy),
-      (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+      $rsrc, $soffset, (as_i16imm $offset), (extract_glc $auxiliary),
+      (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+      (extract_swz $auxiliary))
  >;
}

@@ -1256,8 +1318,12 @@ let SubtargetPredicate = HasPackedD16VMem in {
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i16, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f16, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i16, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f16, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3f32, "BUFFER_STORE_DWORDX3">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3i32, "BUFFER_STORE_DWORDX3">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
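The new v2i16/v2f16 and v4i16/v4f16 entries reuse the untyped dword opcodes, since a packed pair of 16-bit lanes has the same in-memory layout as one dword. By the same logic a hypothetical 128-bit packed type would map onto the x4 opcode, as in this sketch (no such pattern is added by this change):

defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v8f16, "BUFFER_STORE_DWORDX4">;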
@@ -1273,32 +1339,32 @@ multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt,
                                string opcode> {
  def : GCNPat<
    (vt (name vt:$vdata_in, v4i32:$rsrc, 0,
-             0, i32:$soffset, imm:$offset,
-             imm:$cachepolicy, 0)),
+             0, i32:$soffset, timm:$offset,
+             timm:$cachepolicy, 0)),
    (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset,
      (as_i16imm $offset), (extract_slc $cachepolicy))
  >;

  def : GCNPat<
    (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
-             0, i32:$soffset, imm:$offset,
-             imm:$cachepolicy, imm)),
+             0, i32:$soffset, timm:$offset,
+             timm:$cachepolicy, timm)),
    (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset,
      (as_i16imm $offset), (extract_slc $cachepolicy))
  >;

  def : GCNPat<
    (vt (name vt:$vdata_in, v4i32:$rsrc, 0,
-             i32:$voffset, i32:$soffset, imm:$offset,
-             imm:$cachepolicy, 0)),
+             i32:$voffset, i32:$soffset, timm:$offset,
+             timm:$cachepolicy, 0)),
    (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset,
      (as_i16imm $offset), (extract_slc $cachepolicy))
  >;

  def : GCNPat<
    (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
-             i32:$voffset, i32:$soffset, imm:$offset,
-             imm:$cachepolicy, imm)),
+             i32:$voffset, i32:$soffset, timm:$offset,
+             timm:$cachepolicy, timm)),
    (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN)
      $vdata_in,
      (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
@@ -1316,6 +1382,8 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_umax, i32, "BUFFER_ATOMIC_UMAX">;
defm : BufferAtomicPatterns<SIbuffer_atomic_and, i32, "BUFFER_ATOMIC_AND">;
defm : BufferAtomicPatterns<SIbuffer_atomic_or, i32, "BUFFER_ATOMIC_OR">;
defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i32, "BUFFER_ATOMIC_XOR">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i32, "BUFFER_ATOMIC_INC">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i32, "BUFFER_ATOMIC_DEC">;
defm : BufferAtomicPatterns<SIbuffer_atomic_swap, i64, "BUFFER_ATOMIC_SWAP_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_add, i64, "BUFFER_ATOMIC_ADD_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_sub, i64, "BUFFER_ATOMIC_SUB_X2">;
@@ -1326,37 +1394,39 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_umax, i64, "BUFFER_ATOMIC_UMAX_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_and, i64, "BUFFER_ATOMIC_AND_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_or, i64, "BUFFER_ATOMIC_OR_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i64, "BUFFER_ATOMIC_XOR_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i64, "BUFFER_ATOMIC_INC_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i64, "BUFFER_ATOMIC_DEC_X2">;

multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
                                       string opcode> {
  def : GCNPat<
    (name vt:$vdata_in, v4i32:$rsrc, 0,
-          0, i32:$soffset, imm:$offset,
-          imm:$cachepolicy, 0),
+          0, i32:$soffset, timm:$offset,
+          timm:$cachepolicy, 0),
    (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $vdata_in, $rsrc, $soffset,
      (as_i16imm $offset), (extract_slc $cachepolicy))
  >;

  def : GCNPat<
    (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
-          0, i32:$soffset, imm:$offset,
-          imm:$cachepolicy, imm),
+          0, i32:$soffset, timm:$offset,
+          timm:$cachepolicy, timm),
    (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vdata_in, $vindex, $rsrc, $soffset,
      (as_i16imm $offset), (extract_slc $cachepolicy))
  >;

  def : GCNPat<
    (name vt:$vdata_in, v4i32:$rsrc, 0,
-          i32:$voffset, i32:$soffset, imm:$offset,
-          imm:$cachepolicy, 0),
+          i32:$voffset, i32:$soffset, timm:$offset,
+          timm:$cachepolicy, 0),
    (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $vdata_in, $voffset, $rsrc, $soffset,
      (as_i16imm $offset), (extract_slc $cachepolicy))
  >;

  def : GCNPat<
    (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
-          i32:$voffset, i32:$soffset, imm:$offset,
-          imm:$cachepolicy, imm),
+          i32:$voffset, i32:$soffset, timm:$offset,
+          timm:$cachepolicy, timm),
    (!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
      $vdata_in,
      (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
@@ -1370,8 +1440,8 @@ defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_pk_fadd, v2f16, "BUFFER_ATOMI
def : GCNPat<
  (SIbuffer_atomic_cmpswap
    i32:$data, i32:$cmp, v4i32:$rsrc, 0,
-    0, i32:$soffset, imm:$offset,
-    imm:$cachepolicy, 0),
+    0, i32:$soffset, timm:$offset,
+    timm:$cachepolicy, 0),
  (EXTRACT_SUBREG
    (BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN
      (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
@@ -1382,8 +1452,8 @@ def : GCNPat<
  (SIbuffer_atomic_cmpswap
    i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
-    0, i32:$soffset, imm:$offset,
-    imm:$cachepolicy, imm),
+    0, i32:$soffset, timm:$offset,
+    timm:$cachepolicy, timm),
  (EXTRACT_SUBREG
    (BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN
      (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
@@ -1394,8 +1464,8 @@ def : GCNPat<
  (SIbuffer_atomic_cmpswap
    i32:$data, i32:$cmp, v4i32:$rsrc, 0,
-    i32:$voffset, i32:$soffset, imm:$offset,
-    imm:$cachepolicy, 0),
+    i32:$voffset, i32:$soffset, timm:$offset,
+    timm:$cachepolicy, 0),
  (EXTRACT_SUBREG
    (BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN
      (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
@@ -1406,8 +1476,8 @@ def : GCNPat<
  (SIbuffer_atomic_cmpswap
    i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
-    i32:$voffset, i32:$soffset, imm:$offset,
-    imm:$cachepolicy, imm),
+    i32:$voffset, i32:$soffset, timm:$offset,
+    timm:$cachepolicy, timm),
  (EXTRACT_SUBREG
    (BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN
      (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
@@ -1419,8 +1489,8 @@ class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
                               PatFrag constant_ld> : GCNPat <
  (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
-                               i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))),
-  (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc)
+                               i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))),
+  (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz)
>;

@@ -1428,12 +1498,12 @@ multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Ins
  def : GCNPat <
    (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
                                i16:$offset, i1:$slc))),
-    (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0)
+    (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0)
  >;

  def : GCNPat <
    (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))),
-    (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0)
+    (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0)
  >;
}

@@ -1454,8 +1524,8 @@ multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
  def : GCNPat <
    (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset,
-                         i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))),
-    (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc)
+                         i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))),
+    (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz)
  >;
}

@@ -1478,12 +1548,12 @@ multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen,
  def : GCNPat <
    (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
                               i32:$soffset, u16imm:$offset))),
-    (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0)
+    (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
  >;

  def : GCNPat <
    (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
-    (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0)
+    (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
  >;
}

@@ -1493,12 +1563,12 @@ multiclass MUBUFScratchLoadPat_D16 <MUBUF_Pseudo InstrOffen,
                                    ValueType vt, PatFrag ld_frag> {
  def : GCNPat <
    (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in),
-    (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, $in)
+    (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, $in)
  >;

  def : GCNPat <
    (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in),
-    (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, $in)
+    (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, $in)
  >;
}

@@ -1512,7 +1582,10 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, BUFFER_LOAD_SSHORT_OFFSET,
defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, extloadi16_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, zextloadi16_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i16, load_private>;
+
+foreach vt = Reg32Types.types in {
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i32, load_private>;
+}
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX3_OFFEN, BUFFER_LOAD_DWORDX3_OFFSET, v3i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
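The foreach just above stamps out its body once per element of Reg32Types.types, extending the single scratch dword load pattern to every 32-bit register type. With the loop variable substituted for the pattern type, the expansion amounts to one defm per element, along the lines of this sketch (assuming the list contains f32 and v2f16 alongside i32):

defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, f32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, v2f16, load_private>;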
@@ -1535,16 +1608,16 @@ defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D1
multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
                                      ValueType vt, PatFrag atomic_st> {
-  // Store follows atomic op convention so address is forst
+  // Store follows atomic op convention so address is first
  def : GCNPat <
    (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
                            i16:$offset, i1:$slc), vt:$val),
-    (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0)
+    (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0)
  >;

  def : GCNPat <
    (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val),
-    (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0)
+    (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0)
  >;
}
let SubtargetPredicate = isGFX6GFX7 in {
@@ -1558,8 +1631,8 @@ multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
  def : GCNPat <
    (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
-                               i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)),
-    (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc)
+                               i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)),
+    (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz)
  >;
}

@@ -1573,13 +1646,13 @@ multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen,
  def : GCNPat <
    (st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
                                      i32:$soffset, u16imm:$offset)),
-    (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0)
+    (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
  >;

  def : GCNPat <
    (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset,
                                       u16imm:$offset)),
-    (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0)
+    (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
  >;
}

@@ -1587,7 +1660,11 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET,
defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i32, truncstorei16_private>;
defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_private>;
defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i16, store_private>;
-defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET, i32, store_private>;
+
+foreach vt = Reg32Types.types in {
+defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET, vt, store_private>;
+}
+
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private, VReg_64>;
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX3_OFFEN, BUFFER_STORE_DWORDX3_OFFSET, v3i32, store_private, VReg_96>;
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private, VReg_128>;
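Symmetrically, the store side wraps the dword scratch pattern in the same foreach; note that every selected store above also gained a trailing 0 for the new swz operand. A hypothetical extension of the idiom to the 64-bit types would read as follows (sketch only; the wider types are in fact handled by the explicit defms above):

foreach vt = VReg_64.RegTypes in {
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, vt, store_private, VReg_64>;
}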
@@ -1613,37 +1690,41 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_D16_HI_OFFEN, BUFFER_STORE_BYTE_D
multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
                                  string opcode> {
  def : GCNPat<
-    (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
-              imm:$format, imm:$cachepolicy, 0)),
+    (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+              timm:$format, timm:$auxiliary, 0)),
    (!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
      (as_i8imm $format),
-      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+      (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+      (extract_swz $auxiliary))
  >;

  def : GCNPat<
-    (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
-              imm:$format, imm:$cachepolicy, imm)),
+    (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+              timm:$format, timm:$auxiliary, timm)),
    (!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
      (as_i8imm $format),
-      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+      (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+      (extract_swz $auxiliary))
  >;

  def : GCNPat<
-    (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
-              imm:$format, imm:$cachepolicy, 0)),
+    (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+              timm:$format, timm:$auxiliary, 0)),
    (!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
      (as_i8imm $format),
-      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+      (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+      (extract_swz $auxiliary))
  >;

  def : GCNPat<
-    (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
-              imm:$format, imm:$cachepolicy, imm)),
+    (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
+              timm:$format, timm:$auxiliary, timm)),
    (!cast<MTBUF_Pseudo>(opcode # _BOTHEN)
      (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
      $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format),
-      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+      (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+      (extract_swz $auxiliary))
  >;
}

@@ -1671,37 +1752,41 @@ let SubtargetPredicate = HasPackedD16VMem in {
multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
                                   string opcode> {
  def : GCNPat<
-    (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
-          imm:$format, imm:$cachepolicy, 0),
+    (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+          timm:$format, timm:$auxiliary, 0),
    (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset,
      (as_i16imm $offset), (as_i8imm $format),
-      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+      (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+      (extract_swz $auxiliary))
  >;

  def : GCNPat<
-    (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
-          imm:$format, imm:$cachepolicy, imm),
+    (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+          timm:$format, timm:$auxiliary, timm),
    (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
      (as_i16imm $offset), (as_i8imm $format),
-      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+      (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+      (extract_swz $auxiliary))
  >;

  def : GCNPat<
-    (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
-          imm:$format, imm:$cachepolicy, 0),
+    (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+          timm:$format, timm:$auxiliary, 0),
    (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
      (as_i16imm $offset), (as_i8imm $format),
-      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+      (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+      (extract_swz $auxiliary))
  >;

  def : GCNPat<
    (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset,
-          imm:$offset, imm:$format, imm:$cachepolicy, imm),
+          timm:$offset, timm:$format, timm:$auxiliary, timm),
    (!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact)
      $vdata,
      (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
      $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format),
-      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+      (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+      (extract_swz $auxiliary))
  >;
}

@@ -1957,10 +2042,9 @@ defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03a>;
defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03b>;
defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03c>;
defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03d>;
-// FIXME-GFX6-GFX7-GFX10: Add following instructions:
-//defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03e>;
-//defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03f>;
-//defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x040>;
+defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03e>;
+defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03f>;
+defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x040>;
defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x050>;
defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x051>;
defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x052>;
@@ -1975,10 +2059,9 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05b>;
defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05c>;
defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05d>;
// FIXME-GFX7: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on GFX7.
-// FIXME-GFX6-GFX7-GFX10: Add following instructions:
-//defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>;
-//defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>;
-//defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>;
+defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>;
+defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>;
+defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>;

defm BUFFER_WBINVL1_SC : MUBUF_Real_gfx6<0x070>;
defm BUFFER_WBINVL1_VOL : MUBUF_Real_gfx7<0x070>;
@@ -2353,7 +2436,7 @@ let SubtargetPredicate = HasPackedD16VMem in {
def MUBUFInfoTable : GenericTable {
  let FilterClass = "MUBUF_Pseudo";
  let CppTypeName = "MUBUFInfo";
-  let Fields = ["Opcode", "BaseOpcode", "dwords", "has_vaddr", "has_srsrc", "has_soffset"];
+  let Fields = ["Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset"];

  let PrimaryKey = ["Opcode"];
  let PrimaryKeyName = "getMUBUFOpcodeHelper";
@@ -2364,7 +2447,26 @@ def getMUBUFInfoFromOpcode : SearchIndex {
  let Key = ["Opcode"];
}

-def getMUBUFInfoFromBaseOpcodeAndDwords : SearchIndex {
+def getMUBUFInfoFromBaseOpcodeAndElements : SearchIndex {
  let Table = MUBUFInfoTable;
-  let Key = ["BaseOpcode", "dwords"];
+  let Key = ["BaseOpcode", "elements"];
+}
+
+def MTBUFInfoTable : GenericTable {
+  let FilterClass = "MTBUF_Pseudo";
+  let CppTypeName = "MTBUFInfo";
+  let Fields = ["Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset"];
+
+  let PrimaryKey = ["Opcode"];
+  let PrimaryKeyName = "getMTBUFOpcodeHelper";
+}
+
+def getMTBUFInfoFromOpcode : SearchIndex {
+  let Table = MTBUFInfoTable;
+  let Key = ["Opcode"];
+}
+
+def getMTBUFInfoFromBaseOpcodeAndElements : SearchIndex {
+  let Table = MTBUFInfoTable;
+  let Key = ["BaseOpcode", "elements"];
+}
diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td
index c52eaaa3fdc5..816ec14a0e98 100644
--- a/lib/Target/AMDGPU/DSInstructions.td
+++ b/lib/Target/AMDGPU/DSInstructions.td
@@ -81,6 +81,17 @@ class DS_Real <DS_Pseudo ds> :
// DS Pseudo instructions

+class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+  (outs),
+  (ins rc:$data0, offset:$offset, gds:$gds),
+  "$data0$offset$gds"> {
+
+  let has_addr = 0;
+  let has_data1 = 0;
+  let has_vdst = 0;
+}
+
class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
  (outs),
@@ -317,13 +328,16 @@ class DS_GWS <string opName, dag ins, string asmOps>
class DS_GWS_0D <string opName>
: DS_GWS<opName,
-  (ins offset:$offset, gds:$gds), "$offset gds">;
+  (ins offset:$offset, gds:$gds), "$offset gds"> {
+  let hasSideEffects = 1;
+}

class DS_GWS_1D <string opName>
: DS_GWS<opName,
  (ins VGPR_32:$data0, offset:$offset, gds:$gds), "$data0$offset gds"> {
  let has_gws_data0 = 1;
+  let hasSideEffects = 1;
}

class DS_VOID <string opName> : DS_Pseudo<opName,
@@ -391,11 +405,12 @@ def DS_WRITE_B8_D16_HI : DS_1A1D_NORET<"ds_write_b8_d16_hi">;
def DS_WRITE_B16_D16_HI : DS_1A1D_NORET<"ds_write_b16_d16_hi">;
}

+} // End has_m0_read = 0
+
let SubtargetPredicate = HasDSAddTid in {
-def DS_WRITE_ADDTID_B32 : DS_1A1D_NORET<"ds_write_addtid_b32">;
+def DS_WRITE_ADDTID_B32 : DS_0A1D_NORET<"ds_write_addtid_b32">;
}

-} // End has_m0_read = 0
} // End mayLoad = 0

defm DS_MSKOR_B32 : DS_1A2D_NORET_mc<"ds_mskor_b32">;
@@ -540,13 +555,14 @@ def DS_READ_I8_D16_HI : DS_1A_RET_Tied<"ds_read_i8_d16_hi">;
def DS_READ_U16_D16 : DS_1A_RET_Tied<"ds_read_u16_d16">;
def DS_READ_U16_D16_HI : DS_1A_RET_Tied<"ds_read_u16_d16_hi">;
}
+} // End has_m0_read = 0

let SubtargetPredicate = HasDSAddTid in {
-def DS_READ_ADDTID_B32 : DS_1A_RET<"ds_read_addtid_b32">;
-}
-} // End has_m0_read = 0
+def DS_READ_ADDTID_B32 : DS_0A_RET<"ds_read_addtid_b32">;
}

+} // End mayStore = 0
+
def DS_CONSUME : DS_0A_RET<"ds_consume">;
def DS_APPEND : DS_0A_RET<"ds_append">;
def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">;
@@ -600,13 +616,13 @@ def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
//===----------------------------------------------------------------------===//

def : GCNPat <
-  (int_amdgcn_ds_swizzle i32:$src, imm:$offset16),
+  (int_amdgcn_ds_swizzle i32:$src, timm:$offset16),
  (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0))
>;

class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat <
-  (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
-  (inst $ptr, (as_i16imm $offset), (i1 gds))
+  (vt (frag (DS1Addr1Offset i32:$ptr, i16:$offset))),
+  (inst $ptr, offset:$offset, (i1 gds))
>;

multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
@@ -621,8 +637,8 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
}

class DSReadPat_D16 <DS_Pseudo inst, PatFrag frag, ValueType vt> : GCNPat <
-  (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in),
-  (inst $ptr, (as_i16imm $offset), (i1 0), $in)
+  (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$in),
+  (inst $ptr, offset:$offset, (i1 0), $in)
>;

defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">;
@@ -636,13 +652,20 @@ defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "extloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "zextloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">;
-defm : DSReadPat_mc <DS_READ_B32, i32, "load_local">;
+
+foreach vt = Reg32Types.types in {
+defm : DSReadPat_mc <DS_READ_B32, vt, "load_local">;
+}
+
defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">;
defm : DSReadPat_mc <DS_READ_B64, i64, "atomic_load_64_local">;

let AddedComplexity = 100 in {

-defm : DSReadPat_mc <DS_READ_B64, v2i32, "load_align8_local">;
+foreach vt = VReg_64.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B64, vt, "load_align8_local">;
+}
+
defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">;

} // End AddedComplexity = 100
@@ -664,8 +687,8 @@ def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2f16>;
}

class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat <
-  (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
-  (inst $ptr, $value, (as_i16imm $offset), (i1 gds))
+  (frag vt:$value, (DS1Addr1Offset i32:$ptr, i16:$offset)),
+  (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))
>;

multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
@@ -681,8 +704,8 @@ multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
// Irritatingly, atomic_store reverses the order of operands from a
// normal store.
class DSAtomicWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
-  (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
-  (inst $ptr, $value, (as_i16imm $offset), (i1 0))
+  (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value),
+  (inst $ptr, $value, offset:$offset, (i1 0))
>;

multiclass DSAtomicWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
@@ -699,9 +722,13 @@ defm : DSWritePat_mc <DS_WRITE_B8, i32, "truncstorei8_local">;
defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">;
defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">;
defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">;
-defm : DSWritePat_mc <DS_WRITE_B32, i32, "store_local">;
-defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_local">;
-defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_local">;
+
+foreach vt = VGPR_32.RegTypes in {
+defm : DSWritePat_mc <DS_WRITE_B32, vt, "store_local">;
+}
+
+defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_local_32">;
+defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_local_64">;

let OtherPredicates = [D16PreservesUnusedBits] in {
def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_local_hi16>;
@@ -736,46 +763,49 @@ def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, store_local>;

let AddedComplexity = 100 in {

-defm : DSWritePat_mc <DS_WRITE_B64, v2i32, "store_align8_local">;
+foreach vt = VReg_64.RegTypes in {
+defm : DSWritePat_mc <DS_WRITE_B64, vt, "store_align8_local">;
+}
+
defm : DSWritePat_mc <DS_WRITE_B128, v4i32, "store_align16_local">;

} // End AddedComplexity = 100

class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
-  (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
-  (inst $ptr, $value, (as_i16imm $offset), (i1 gds))
+  (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value),
+  (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))
>;

multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
  let OtherPredicates = [LDSRequiresM0Init] in {
-    def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0")>;
+    def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
  }

  let OtherPredicates = [NotLDSRequiresM0Init] in {
    def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
-                         !cast<PatFrag>(frag#"_local")>;
+                         !cast<PatFrag>(frag#"_local_"#vt.Size)>;
  }

-  def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0"), 1>;
+  def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), 1>;
}

class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
-  (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
-  (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 gds))
+  (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap),
+  (inst $ptr, getVregSrcForVT<vt>.ret:$cmp, getVregSrcForVT<vt>.ret:$swap, offset:$offset, (i1 gds))
>;

multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, ValueType vt, string frag> {
  let OtherPredicates = [LDSRequiresM0Init] in {
-    def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_local_m0")>;
+    def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
  }

  let OtherPredicates = [NotLDSRequiresM0Init] in {
    def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
-                          !cast<PatFrag>(frag#"_local")>;
+                          !cast<PatFrag>(frag#"_local_"#vt.Size)>;
  }

-  def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0"), 1>;
+  def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), 1>;
}
diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td
index 0550092ce1d6..792e26d21f98 100644
--- a/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -322,46 +322,46 @@ def : EGOrCaymanPat<(i32 (atomic_cmp_swap_global_noret i32:$ptr, i32:$cmp, i32:$
 defm AtomicSwapPat : AtomicPat <RAT_ATOMIC_XCHG_INT_RTN,
                                 RAT_ATOMIC_XCHG_INT_NORET,
-                                atomic_swap_global_ret,
-                                atomic_swap_global_noret>;
+                                atomic_swap_global_ret_32,
+                                atomic_swap_global_noret_32>;
 defm AtomicAddPat : AtomicPat <RAT_ATOMIC_ADD_RTN, RAT_ATOMIC_ADD_NORET,
-                               atomic_add_global_ret, atomic_add_global_noret>;
+                               atomic_load_add_global_ret_32, atomic_load_add_global_noret_32>;
 defm AtomicSubPat : AtomicPat <RAT_ATOMIC_SUB_RTN, RAT_ATOMIC_SUB_NORET,
-                               atomic_sub_global_ret, atomic_sub_global_noret>;
+                               atomic_load_sub_global_ret_32, atomic_load_sub_global_noret_32>;
 defm AtomicMinPat : AtomicPat <RAT_ATOMIC_MIN_INT_RTN,
                                RAT_ATOMIC_MIN_INT_NORET,
-                               atomic_min_global_ret, atomic_min_global_noret>;
+                               atomic_load_min_global_ret_32, atomic_load_min_global_noret_32>;
 defm AtomicUMinPat : AtomicPat <RAT_ATOMIC_MIN_UINT_RTN,
                                 RAT_ATOMIC_MIN_UINT_NORET,
-                                atomic_umin_global_ret, atomic_umin_global_noret>;
+                                atomic_load_umin_global_ret_32, atomic_load_umin_global_noret_32>;
 defm AtomicMaxPat : AtomicPat <RAT_ATOMIC_MAX_INT_RTN,
                                RAT_ATOMIC_MAX_INT_NORET,
-                               atomic_max_global_ret, atomic_max_global_noret>;
+                               atomic_load_max_global_ret_32, atomic_load_max_global_noret_32>;
 defm AtomicUMaxPat : AtomicPat <RAT_ATOMIC_MAX_UINT_RTN,
                                 RAT_ATOMIC_MAX_UINT_NORET,
-                                atomic_umax_global_ret, atomic_umax_global_noret>;
+                                atomic_load_umax_global_ret_32, atomic_load_umax_global_noret_32>;
 defm AtomicAndPat : AtomicPat <RAT_ATOMIC_AND_RTN, RAT_ATOMIC_AND_NORET,
-                               atomic_and_global_ret, atomic_and_global_noret>;
+                               atomic_load_and_global_ret_32, atomic_load_and_global_noret_32>;
 defm AtomicOrPat : AtomicPat <RAT_ATOMIC_OR_RTN, RAT_ATOMIC_OR_NORET,
-                              atomic_or_global_ret, atomic_or_global_noret>;
+                              atomic_load_or_global_ret_32, atomic_load_or_global_noret_32>;
 defm AtomicXorPat : AtomicPat <RAT_ATOMIC_XOR_RTN, RAT_ATOMIC_XOR_NORET,
-                               atomic_xor_global_ret, atomic_xor_global_noret>;
+                               atomic_load_xor_global_ret_32, atomic_load_xor_global_noret_32>;
 defm AtomicIncAddPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_RTN,
                                         RAT_ATOMIC_INC_UINT_NORET,
-                                        atomic_add_global_ret,
-                                        atomic_add_global_noret, 1>;
+                                        atomic_load_add_global_ret_32,
+                                        atomic_load_add_global_noret_32, 1>;
 defm AtomicIncSubPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_RTN,
                                         RAT_ATOMIC_INC_UINT_NORET,
-                                        atomic_sub_global_ret,
-                                        atomic_sub_global_noret, -1>;
+                                        atomic_load_sub_global_ret_32,
+                                        atomic_load_sub_global_noret_32, -1>;
 defm AtomicDecAddPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_RTN,
                                         RAT_ATOMIC_DEC_UINT_NORET,
-                                        atomic_add_global_ret,
-                                        atomic_add_global_noret, -1>;
+                                        atomic_load_add_global_ret_32,
+                                        atomic_load_add_global_noret_32, -1>;
 defm AtomicDecSubPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_RTN,
                                         RAT_ATOMIC_DEC_UINT_NORET,
-                                        atomic_sub_global_ret,
-                                        atomic_sub_global_noret, 1>;
+                                        atomic_load_sub_global_ret_32,
+                                        atomic_load_sub_global_noret_32, 1>;
 
 // Should be predicated on FeatureFP64
 // def FMA_64 : R600_3OP <
@@ -628,37 +628,37 @@ def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE",
   [(truncstorei16_local i32:$src1, i32:$src0)]
 >;
 def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD",
-  [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))]
+  [(set i32:$dst, (atomic_load_add_local_32 i32:$src0, i32:$src1))]
 >;
 def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB",
-  [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))]
+  [(set i32:$dst, (atomic_load_sub_local_32 i32:$src0, i32:$src1))]
 >;
 def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND",
-  [(set i32:$dst, (atomic_load_and_local i32:$src0, i32:$src1))]
+  [(set i32:$dst, (atomic_load_and_local_32 i32:$src0, i32:$src1))]
 >;
 def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR",
-  [(set i32:$dst, (atomic_load_or_local i32:$src0, i32:$src1))]
+  [(set i32:$dst, (atomic_load_or_local_32 i32:$src0, i32:$src1))]
 >;
 def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR",
-  [(set i32:$dst, (atomic_load_xor_local i32:$src0, i32:$src1))]
+  [(set i32:$dst, (atomic_load_xor_local_32 i32:$src0, i32:$src1))]
 >;
 def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT",
-  [(set i32:$dst, (atomic_load_min_local i32:$src0, i32:$src1))]
+  [(set i32:$dst, (atomic_load_min_local_32 i32:$src0, i32:$src1))]
 >;
 def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT",
-  [(set i32:$dst, (atomic_load_max_local i32:$src0, i32:$src1))]
+  [(set i32:$dst, (atomic_load_max_local_32 i32:$src0, i32:$src1))]
 >;
 def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT",
-  [(set i32:$dst, (atomic_load_umin_local i32:$src0, i32:$src1))]
+  [(set i32:$dst, (atomic_load_umin_local_32 i32:$src0, i32:$src1))]
 >;
 def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT",
-  [(set i32:$dst, (atomic_load_umax_local i32:$src0, i32:$src1))]
+  [(set i32:$dst, (atomic_load_umax_local_32 i32:$src0, i32:$src1))]
 >;
 def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG",
-  [(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))]
+  [(set i32:$dst, (atomic_swap_local_32 i32:$src0, i32:$src1))]
 >;
 def LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST",
-  [(set i32:$dst, (atomic_cmp_swap_local i32:$src0, i32:$src1, i32:$src2))]
+  [(set i32:$dst, (atomic_cmp_swap_local_32 i32:$src0, i32:$src1, i32:$src2))]
 >;
 def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
   [(set (i32 R600_Reg32:$dst), (load_local R600_Reg32:$src0))]
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index 889f60dae920..80ee17eba141 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -270,7 +270,7 @@ multiclass FLAT_Atomic_Pseudo<
   SDPatternOperator atomic = null_frag,
   ValueType data_vt = vt,
   RegisterClass data_rc = vdst_rc,
-  bit isFP = getIsFP<data_vt>.ret> {
+  bit isFP = isFloatType<data_vt>.ret> {
   def "" : FLAT_AtomicNoRet_Pseudo <opName,
     (outs),
     (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc),
@@ -300,7 +300,7 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
   SDPatternOperator atomic = null_frag,
   ValueType data_vt = vt,
   RegisterClass data_rc = vdst_rc,
-  bit isFP = getIsFP<data_vt>.ret> {
+  bit isFP = isFloatType<data_vt>.ret> {
 
   def "" : FLAT_AtomicNoRet_Pseudo <opName,
     (outs),
@@ -333,7 +333,7 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
   SDPatternOperator atomic = null_frag,
   ValueType data_vt = vt,
   RegisterClass data_rc = vdst_rc,
-  bit isFP = getIsFP<data_vt>.ret> {
+  bit isFP = isFloatType<data_vt>.ret> {
 
   def _RTN : FLAT_AtomicRet_Pseudo <opName,
     (outs vdst_rc:$vdst),
@@ -564,76 +564,76 @@ defm GLOBAL_ATOMIC_CMPSWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswa
                                 v2i64, VReg_128>;
 
 defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_swap",
-                             VGPR_32, i32, atomic_swap_global>;
+                             VGPR_32, i32, atomic_swap_global_32>;
 
 defm GLOBAL_ATOMIC_SWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_swap_x2",
-                                VReg_64, i64, atomic_swap_global>;
+                                VReg_64, i64, atomic_swap_global_64>;
 
 defm GLOBAL_ATOMIC_ADD : FLAT_Global_Atomic_Pseudo <"global_atomic_add",
-                           VGPR_32, i32, atomic_add_global>;
+                           VGPR_32, i32, atomic_load_add_global_32>;
 
 defm GLOBAL_ATOMIC_SUB : FLAT_Global_Atomic_Pseudo <"global_atomic_sub",
-                           VGPR_32, i32, atomic_sub_global>;
+                           VGPR_32, i32, atomic_load_sub_global_32>;
 
 defm GLOBAL_ATOMIC_SMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_smin",
-                            VGPR_32, i32, atomic_min_global>;
+                            VGPR_32, i32, atomic_load_min_global_32>;
 
 defm GLOBAL_ATOMIC_UMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_umin",
-                            VGPR_32, i32, atomic_umin_global>;
+                            VGPR_32, i32, atomic_load_umin_global_32>;
 
 defm GLOBAL_ATOMIC_SMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_smax",
-                            VGPR_32, i32, atomic_max_global>;
+                            VGPR_32, i32, atomic_load_max_global_32>;
 
 defm GLOBAL_ATOMIC_UMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_umax",
-                            VGPR_32, i32, atomic_umax_global>;
+                            VGPR_32, i32, atomic_load_umax_global_32>;
 
 defm GLOBAL_ATOMIC_AND : FLAT_Global_Atomic_Pseudo <"global_atomic_and",
-                           VGPR_32, i32, atomic_and_global>;
+                           VGPR_32, i32, atomic_load_and_global_32>;
 
 defm GLOBAL_ATOMIC_OR : FLAT_Global_Atomic_Pseudo <"global_atomic_or",
-                          VGPR_32, i32, atomic_or_global>;
+                          VGPR_32, i32, atomic_load_or_global_32>;
 
 defm GLOBAL_ATOMIC_XOR : FLAT_Global_Atomic_Pseudo <"global_atomic_xor",
-                           VGPR_32, i32, atomic_xor_global>;
+                           VGPR_32, i32, atomic_load_xor_global_32>;
 
 defm GLOBAL_ATOMIC_INC : FLAT_Global_Atomic_Pseudo <"global_atomic_inc",
-                           VGPR_32, i32, atomic_inc_global>;
+                           VGPR_32, i32, atomic_inc_global_32>;
 
 defm GLOBAL_ATOMIC_DEC : FLAT_Global_Atomic_Pseudo <"global_atomic_dec",
-                           VGPR_32, i32, atomic_dec_global>;
+                           VGPR_32, i32, atomic_dec_global_32>;
 
 defm GLOBAL_ATOMIC_ADD_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_add_x2",
-                              VReg_64, i64, atomic_add_global>;
+                              VReg_64, i64, atomic_load_add_global_64>;
 
 defm GLOBAL_ATOMIC_SUB_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_sub_x2",
-                              VReg_64, i64, atomic_sub_global>;
+                              VReg_64, i64, atomic_load_sub_global_64>;
 
 defm GLOBAL_ATOMIC_SMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smin_x2",
-                               VReg_64, i64, atomic_min_global>;
+                               VReg_64, i64, atomic_load_min_global_64>;
 
 defm GLOBAL_ATOMIC_UMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umin_x2",
-                               VReg_64, i64, atomic_umin_global>;
+                               VReg_64, i64, atomic_load_umin_global_64>;
 
 defm GLOBAL_ATOMIC_SMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smax_x2",
-                               VReg_64, i64, atomic_max_global>;
+                               VReg_64, i64, atomic_load_max_global_64>;
 
 defm GLOBAL_ATOMIC_UMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umax_x2",
-                               VReg_64, i64, atomic_umax_global>;
+                               VReg_64, i64, atomic_load_umax_global_64>;
 
 defm GLOBAL_ATOMIC_AND_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_and_x2",
-                              VReg_64, i64, atomic_and_global>;
+                              VReg_64, i64, atomic_load_and_global_64>;
 
 defm GLOBAL_ATOMIC_OR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_or_x2",
-                             VReg_64, i64, atomic_or_global>;
+                             VReg_64, i64, atomic_load_or_global_64>;
 
 defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_xor_x2",
-                              VReg_64, i64, atomic_xor_global>;
+                              VReg_64, i64, atomic_load_xor_global_64>;
 
 defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_inc_x2",
-                              VReg_64, i64, atomic_inc_global>;
+                              VReg_64, i64, atomic_inc_global_64>;
 
 defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2",
-                              VReg_64, i64, atomic_dec_global>;
+                              VReg_64, i64, atomic_dec_global_64>;
 
 } // End is_flat_global = 1
 } // End SubtargetPredicate = HasFlatGlobalInsts
@@ -686,10 +686,10 @@ let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
 
 let SubtargetPredicate = HasAtomicFaddInsts, is_flat_global = 1 in {
   defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
-    "global_atomic_add_f32", VGPR_32, f32, atomic_add_global
+    "global_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret
   >;
   defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
-    "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global
+    "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_pk_fadd_global_noret
   >;
 } // End SubtargetPredicate = HasAtomicFaddInsts
@@ -777,8 +777,6 @@ def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
 def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
 def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
 def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, v2i32>;
 def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
 def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, v4i32>;
@@ -787,41 +785,50 @@ def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_load_64_flat, i64>;
 
 def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
 def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
-def : FlatStorePat <FLAT_STORE_DWORD, store_flat, i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, v2i32, VReg_64>;
+
+foreach vt = Reg32Types.types in {
+def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, vt>;
+def : FlatStorePat <FLAT_STORE_DWORD, store_flat, vt>;
+}
+
+foreach vt = VReg_64.RegTypes in {
+def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, vt, VReg_64>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, vt>;
+}
+
 def : FlatStorePat <FLAT_STORE_DWORDX3, store_flat, v3i32, VReg_96>;
 def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, v4i32, VReg_128>;
 
 def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_flat_32, i32>;
 def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_flat_64, i64, VReg_64>;
 
-def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN, atomic_inc_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN, atomic_dec_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_load_add_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_load_sub_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN, atomic_inc_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN, atomic_dec_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_load_and_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_load_max_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_load_umax_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_load_min_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_load_umin_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_load_or_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global_32, i32>;
 def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global, i32, v2i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_load_xor_global_32, i32>;
 
-def : FlatAtomicPat <FLAT_ATOMIC_ADD_X2_RTN, atomic_add_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SUB_X2_RTN, atomic_sub_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN, atomic_inc_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN, atomic_dec_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_X2_RTN, atomic_and_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMAX_X2_RTN, atomic_max_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMAX_X2_RTN, atomic_umax_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMIN_X2_RTN, atomic_min_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMIN_X2_RTN, atomic_umin_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_OR_X2_RTN, atomic_or_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_X2_RTN, atomic_load_add_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_X2_RTN, atomic_load_sub_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN, atomic_inc_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN, atomic_dec_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_X2_RTN, atomic_load_and_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_X2_RTN, atomic_load_max_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_X2_RTN, atomic_load_umax_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_X2_RTN, atomic_load_min_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_X2_RTN, atomic_load_umin_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_X2_RTN, atomic_load_or_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, atomic_swap_global_64, i64>;
 def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_load_xor_global_64, i64>;
 
 def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
 def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
@@ -847,9 +854,6 @@ def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
 }
 
 } // End OtherPredicates = [HasFlatAddressSpace]
 
-def atomic_fadd_global : global_binary_atomic_op_frag<SIglobal_atomic_fadd>;
-def atomic_pk_fadd_global : global_binary_atomic_op_frag<SIglobal_atomic_pk_fadd>;
-
 let OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 in {
 
 def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, extloadi8_global, i32>;
@@ -863,8 +867,16 @@ def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, zextloadi16_global, i32>;
 def : FlatLoadSignedPat <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>;
 def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, load_global, i16>;
 
-def : FlatLoadSignedPat <GLOBAL_LOAD_DWORD, load_global, i32>;
-def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX2, load_global, v2i32>;
+foreach vt = Reg32Types.types in {
+def : FlatLoadSignedPat <GLOBAL_LOAD_DWORD, load_global, vt>;
+def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, vt, VGPR_32>;
+}
+
+foreach vt = VReg_64.RegTypes in {
+def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX2, load_global, vt>;
+def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, vt, VReg_64>;
+}
+
 def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX3, load_global, v3i32>;
 def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX4, load_global, v4i32>;
@@ -875,8 +887,6 @@ def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i32, VGPR_32>;
 def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i16, VGPR_32>;
 def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, truncstorei16_global, i32, VGPR_32>;
 def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, store_global, i16, VGPR_32>;
-def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, i32, VGPR_32>;
-def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32, VReg_64>;
 def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX3, store_global, v3i32, VReg_96>;
 def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32, VReg_128>;
@@ -902,36 +912,36 @@ def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>;
 
 def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORD, store_atomic_global, i32>;
 def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORDX2, store_atomic_global, i64, VReg_64>;
 
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_RTN, atomic_add_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_RTN, atomic_inc_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_RTN, atomic_dec_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_RTN, atomic_and_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_RTN, atomic_or_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_RTN, atomic_load_add_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_RTN, atomic_load_sub_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_RTN, atomic_inc_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_RTN, atomic_dec_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_RTN, atomic_load_and_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_RTN, atomic_load_max_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_RTN, atomic_load_umax_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_RTN, atomic_load_min_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_RTN, atomic_load_umin_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_RTN, atomic_load_or_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_RTN, atomic_swap_global_32, i32>;
 def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global, i32, v2i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_RTN, atomic_load_xor_global_32, i32>;
 
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_X2_RTN, atomic_add_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_X2_RTN, atomic_sub_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_X2_RTN, atomic_inc_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_X2_RTN, atomic_dec_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_X2_RTN, atomic_and_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_X2_RTN, atomic_max_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_X2_RTN, atomic_umax_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_X2_RTN, atomic_min_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_X2_RTN, atomic_umin_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_X2_RTN, atomic_or_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_X2_RTN, atomic_load_add_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_X2_RTN, atomic_load_sub_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_X2_RTN, atomic_inc_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_X2_RTN, atomic_dec_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_X2_RTN, atomic_load_and_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_X2_RTN, atomic_load_max_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_X2_RTN, atomic_load_umax_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_X2_RTN, atomic_load_min_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_X2_RTN, atomic_load_umin_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_X2_RTN, atomic_load_or_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_X2_RTN, atomic_swap_global_64, i64>;
 def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_load_xor_global_64, i64>;
 
-def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_ADD_F32, atomic_fadd_global, f32>;
-def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_PK_ADD_F16, atomic_pk_fadd_global, v2f16>;
+def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_ADD_F32, atomic_fadd_global_noret, f32>;
+def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_PK_ADD_F16, atomic_pk_fadd_global_noret, v2f16>;
 
 } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
@@ -1174,7 +1184,7 @@ class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> :
   let AssemblerPredicate = isGFX10Plus;
   let DecoderNamespace = "GFX10";
 
-  let Inst{11-0} = {offset{12}, offset{10-0}};
+  let Inst{11-0} = offset{11-0};
   let Inst{12} = !if(ps.has_dlc, dlc, ps.dlcValue);
   let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7d), 0x7d);
   let Inst{55} = 0;
diff --git a/lib/Target/AMDGPU/GCNDPPCombine.cpp b/lib/Target/AMDGPU/GCNDPPCombine.cpp
index e1845e2e8e87..98678873e37c 100644
--- a/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -41,6 +41,7 @@
 #include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -155,8 +156,6 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                            RegSubRegPair CombOldVGPR,
                                            bool CombBCZ) const {
   assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
-  assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
-         TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());
 
   auto OrigOp = OrigMI.getOpcode();
   auto DPPOp = getDPPOp(OrigOp);
@@ -178,7 +177,9 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
     if (OldIdx != -1) {
       assert(OldIdx == NumOperands);
       assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI));
-      DPPInst.addReg(CombOldVGPR.Reg, 0, CombOldVGPR.SubReg);
+      auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI);
+      DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef,
+                     CombOldVGPR.SubReg);
       ++NumOperands;
     } else {
       // TODO: this discards MAC/FMA instructions for now, let's add it later
@@ -195,6 +196,10 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
       assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
       DPPInst.addImm(Mod0->getImm());
       ++NumOperands;
+    } else if (AMDGPU::getNamedOperandIdx(DPPOp,
+                   AMDGPU::OpName::src0_modifiers) != -1) {
+      DPPInst.addImm(0);
+      ++NumOperands;
     }
     auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
     assert(Src0);
@@ -214,6 +219,10 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
       assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
       DPPInst.addImm(Mod1->getImm());
       ++NumOperands;
+    } else if (AMDGPU::getNamedOperandIdx(DPPOp,
+                   AMDGPU::OpName::src1_modifiers) != -1) {
+      DPPInst.addImm(0);
+      ++NumOperands;
     }
     if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
       if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
@@ -344,6 +353,10 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
   auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
   assert(DstOpnd && DstOpnd->isReg());
   auto DPPMovReg = DstOpnd->getReg();
+  if (DPPMovReg.isPhysical()) {
+    LLVM_DEBUG(dbgs() << "  failed: dpp move writes physreg\n");
+    return false;
+  }
   if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
     LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                          " for all uses\n");
@@ -362,7 +375,13 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
   bool BoundCtrlZero = BCZOpnd->getImm();
 
   auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
+  auto *SrcOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
   assert(OldOpnd && OldOpnd->isReg());
+  assert(SrcOpnd && SrcOpnd->isReg());
+  if (OldOpnd->getReg().isPhysical() || SrcOpnd->getReg().isPhysical()) {
+    LLVM_DEBUG(dbgs() << "  failed: dpp move reads physreg\n");
+    return false;
+  }
 
   auto * const OldOpndValue = getOldOpndValue(*OldOpnd);
   // OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something else
@@ -408,6 +427,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
              dbgs() << ", bound_ctrl=" << CombBCZ << '\n');
 
   SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs;
+  DenseMap<MachineInstr*, SmallVector<unsigned, 4>> RegSeqWithOpNos;
   auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
   // try to reuse previous old reg if its undefined (IMPLICIT_DEF)
   if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
@@ -420,13 +440,49 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
   OrigMIs.push_back(&MovMI);
   bool Rollback = true;
+  SmallVector<MachineOperand*, 16> Uses;
+
   for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) {
+    Uses.push_back(&Use);
+  }
+
+  while (!Uses.empty()) {
+    MachineOperand *Use = Uses.pop_back_val();
     Rollback = true;
-    auto &OrigMI = *Use.getParent();
+    auto &OrigMI = *Use->getParent();
     LLVM_DEBUG(dbgs() << "  try: " << OrigMI);
 
     auto OrigOp = OrigMI.getOpcode();
+    if (OrigOp == AMDGPU::REG_SEQUENCE) {
+      Register FwdReg = OrigMI.getOperand(0).getReg();
+      unsigned FwdSubReg = 0;
+
+      if (execMayBeModifiedBeforeAnyUse(*MRI, FwdReg, OrigMI)) {
+        LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
+                             " for all uses\n");
+        break;
+      }
+
+      unsigned OpNo, E = OrigMI.getNumOperands();
+      for (OpNo = 1; OpNo < E; OpNo += 2) {
+        if (OrigMI.getOperand(OpNo).getReg() == DPPMovReg) {
+          FwdSubReg = OrigMI.getOperand(OpNo + 1).getImm();
+          break;
+        }
+      }
+
+      if (!FwdSubReg)
+        break;
+
+      for (auto &Op : MRI->use_nodbg_operands(FwdReg)) {
+        if (Op.getSubReg() == FwdSubReg)
+          Uses.push_back(&Op);
+      }
+      RegSeqWithOpNos[&OrigMI].push_back(OpNo);
+      continue;
+    }
+
     if (TII->isVOP3(OrigOp)) {
       if (!TII->hasVALU32BitEncoding(OrigOp)) {
         LLVM_DEBUG(dbgs() << "  failed: VOP3 hasn't e32 equivalent\n");
@@ -447,14 +503,14 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
     }
     LLVM_DEBUG(dbgs() << "  combining: " << OrigMI);
-    if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
+    if (Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
       if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
                                         OldOpndValue, CombBCZ)) {
         DPPMIs.push_back(DPPInst);
         Rollback = false;
       }
     } else if (OrigMI.isCommutable() &&
-               &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
+               Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
       auto *BB = OrigMI.getParent();
       auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
       BB->insert(OrigMI, NewMI);
@@ -475,9 +531,22 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
     OrigMIs.push_back(&OrigMI);
   }
 
+  Rollback |= !Uses.empty();
+
   for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs))
     MI->eraseFromParent();
 
+  if (!Rollback) {
+    for (auto &S : RegSeqWithOpNos) {
+      if (MRI->use_nodbg_empty(S.first->getOperand(0).getReg())) {
+        S.first->eraseFromParent();
+        continue;
+      }
+      while (!S.second.empty())
+        S.first->getOperand(S.second.pop_back_val()).setIsUndef(true);
+    }
+  }
+
   return !Rollback;
 }
 
@@ -498,6 +567,13 @@ bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
       if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
         Changed = true;
         ++NumDPPMovsCombined;
+      } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) {
+        auto Split = TII->expandMovDPP64(MI);
+        for (auto M : { Split.first, Split.second }) {
+          if (combineDPPMov(*M))
+            ++NumDPPMovsCombined;
+        }
+        Changed = true;
       }
     }
   }
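Note on the GCNDPPCombine.cpp change above: combineDPPMov() now walks the DPP mov's uses with an explicit worklist, so a use that feeds a REG_SEQUENCE can be forwarded to the users of the sequence's result instead of blocking the combine. The sketch below reproduces only that traversal shape with hypothetical toy types — Use, IsRegSequence, Forwarded, and Combinable are stand-ins, not the real MIR API, and the rollback handling is simplified:

    #include <vector>

    // Toy stand-in for a use of the DPP mov's result.
    struct Use {
      bool IsRegSequence = false;   // pass-through copy, not combinable itself
      std::vector<Use *> Forwarded; // uses of the REG_SEQUENCE result
      bool Combinable = false;
    };

    // Pop a use; either forward through a REG_SEQUENCE by queueing the users
    // of its result, or try to combine it. Any failure forces a rollback,
    // loosely mirroring the Rollback flag in the pass.
    static bool combineAllUses(std::vector<Use *> Worklist) {
      bool Rollback = false;
      while (!Worklist.empty()) {
        Use *U = Worklist.back();
        Worklist.pop_back();
        if (U->IsRegSequence) {
          for (Use *Fwd : U->Forwarded)
            Worklist.push_back(Fwd);
          continue;
        }
        Rollback |= !U->Combinable;
      }
      return !Rollback;
    }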
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 885239e2faed..9528aee4c50e 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -726,7 +726,7 @@ int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
   if (!TRI->isVGPR(MRI, Def.getReg()))
     return WaitStatesNeeded;
-  unsigned Reg = Def.getReg();
+  Register Reg = Def.getReg();
   auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
     int DataIdx = createsVALUHazard(*MI);
     return DataIdx >= 0 &&
@@ -792,7 +792,7 @@ int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
     return 0;
 
-  unsigned LaneSelectReg = LaneSelectOp->getReg();
+  Register LaneSelectReg = LaneSelectOp->getReg();
   auto IsHazardFn = [TII] (MachineInstr *MI) {
     return TII->isVALU(*MI);
   };
@@ -891,7 +891,7 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
   // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
   // which is always a VGPR and available.
   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
-  unsigned Reg = Src0->getReg();
+  Register Reg = Src0->getReg();
   bool IsUndef = Src0->isUndef();
   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
           TII->get(AMDGPU::V_MOV_B32_e32))
@@ -952,6 +952,7 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
   unsigned SDSTName;
   switch (MI->getOpcode()) {
   case AMDGPU::V_READLANE_B32:
+  case AMDGPU::V_READLANE_B32_gfx10:
   case AMDGPU::V_READFIRSTLANE_B32:
     SDSTName = AMDGPU::OpName::vdst;
     break;
@@ -976,7 +977,7 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
   if (!SDST)
     return false;
 
-  const unsigned SDSTReg = SDST->getReg();
+  const Register SDSTReg = SDST->getReg();
   auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
     return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
   };
@@ -1251,14 +1252,14 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
     const int MaxWaitStates = 18;
-    unsigned Reg = Op.getReg();
+    Register Reg = Op.getReg();
     unsigned HazardDefLatency = 0;
 
     auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
                               (MachineInstr *MI) {
       if (!IsMFMAFn(MI))
         return false;
-      unsigned DstReg = MI->getOperand(0).getReg();
+      Register DstReg = MI->getOperand(0).getReg();
       if (DstReg == Reg)
         return false;
       HazardDefLatency = std::max(HazardDefLatency,
@@ -1304,7 +1305,7 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
     auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
       if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
         return false;
-      unsigned DstReg = MI->getOperand(0).getReg();
+      Register DstReg = MI->getOperand(0).getReg();
       return TRI.regsOverlap(Reg, DstReg);
     };
@@ -1330,14 +1331,14 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
     const int MaxWaitStates = 13;
-    unsigned DstReg = MI->getOperand(0).getReg();
+    Register DstReg = MI->getOperand(0).getReg();
     unsigned HazardDefLatency = 0;
 
     auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
                         (MachineInstr *MI) {
       if (!IsMFMAFn(MI))
        return false;
-      unsigned Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
+      Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
       HazardDefLatency = std::max(HazardDefLatency,
                                   TSchedModel.computeInstrLatency(MI));
       return TRI.regsOverlap(Reg, DstReg);
@@ -1376,7 +1377,7 @@ int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
       continue;
 
-    unsigned Reg = Op.getReg();
+    Register Reg = Op.getReg();
 
     const int AccVgprReadLdStWaitStates = 2;
     const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
diff --git a/lib/Target/AMDGPU/GCNILPSched.cpp b/lib/Target/AMDGPU/GCNILPSched.cpp
index 1eb617640c32..39072af7d871 100644
--- a/lib/Target/AMDGPU/GCNILPSched.cpp
+++ b/lib/Target/AMDGPU/GCNILPSched.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
 
 using namespace llvm;
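Note: the unsigned-to-Register conversions in GCNHazardRecognizer.cpp above, and in most of the C++ files that follow, belong to the migration from the static TargetRegisterInfo::isVirtualRegister()/isPhysicalRegister() helpers to the llvm::Register wrapper. A minimal sketch of the idiom, assuming the llvm::Register interface as imported here:

    #include "llvm/CodeGen/Register.h"
    using llvm::Register;

    // Register converts implicitly to and from the old unsigned id, so call
    // sites can switch incrementally while gaining the vreg/physreg queries
    // that previously lived on TargetRegisterInfo as static helpers.
    static const char *classify(Register Reg) {
      if (Reg.isVirtual())   // same test as Register::isVirtualRegister(Reg)
        return "virtual";
      if (Reg.isPhysical())  // same test as Register::isPhysicalRegister(Reg)
        return "physical";
      return "no register";  // Reg == 0
    }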
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 3525174223bd..90ab6a14ce20 100644
--- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -237,7 +237,7 @@ public:
 
 GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C,
                                              StrategyKind S)
-  : BaseClass(C, llvm::make_unique<SchedStrategyStub>())
+  : BaseClass(C, std::make_unique<SchedStrategyStub>())
   , Context(C)
   , Strategy(S)
   , UPTracker(*LIS) {
diff --git a/lib/Target/AMDGPU/GCNNSAReassign.cpp b/lib/Target/AMDGPU/GCNNSAReassign.cpp
index 51c4c99cfb18..36a8f74150f5 100644
--- a/lib/Target/AMDGPU/GCNNSAReassign.cpp
+++ b/lib/Target/AMDGPU/GCNNSAReassign.cpp
@@ -173,11 +173,11 @@ GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
   bool NSA = false;
   for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
     const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
-    unsigned Reg = Op.getReg();
-    if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
+    Register Reg = Op.getReg();
+    if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
       return NSA_Status::FIXED;
 
-    unsigned PhysReg = VRM->getPhys(Reg);
+    Register PhysReg = VRM->getPhys(Reg);
 
     if (!Fast) {
       if (!PhysReg)
@@ -276,7 +276,7 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
       SlotIndex MinInd, MaxInd;
       for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
         const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
-        unsigned Reg = Op.getReg();
+        Register Reg = Op.getReg();
         LiveInterval *LI = &LIS->getInterval(Reg);
         if (llvm::find(Intervals, LI) != Intervals.end()) {
           // Same register used, unable to make sequential
diff --git a/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/lib/Target/AMDGPU/GCNRegBankReassign.cpp
index f0d47eaa4ed1..2927d4eb745a 100644
--- a/lib/Target/AMDGPU/GCNRegBankReassign.cpp
+++ b/lib/Target/AMDGPU/GCNRegBankReassign.cpp
@@ -230,7 +230,7 @@ private:
 public:
   Printable printReg(unsigned Reg, unsigned SubReg = 0) const {
     return Printable([Reg, SubReg, this](raw_ostream &OS) {
-      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      if (Register::isPhysicalRegister(Reg)) {
         OS << llvm::printReg(Reg, TRI);
         return;
       }
@@ -275,7 +275,7 @@ char GCNRegBankReassign::ID = 0;
 char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
 
 unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
-  assert (TargetRegisterInfo::isPhysicalRegister(Reg));
+  assert(Register::isPhysicalRegister(Reg));
 
   const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
   unsigned Size = TRI->getRegSizeInBits(*RC);
@@ -293,7 +293,7 @@ unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
                                             int Bank) {
-  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+  if (Register::isVirtualRegister(Reg)) {
     if (!VRM->isAssignedReg(Reg))
       return 0;
@@ -364,7 +364,7 @@ unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI,
     if (!Op.isReg() || Op.isUndef())
       continue;
 
-    unsigned R = Op.getReg();
+    Register R = Op.getReg();
     if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R)))
       continue;
@@ -420,12 +420,12 @@ unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
 }
 
 bool GCNRegBankReassign::isReassignable(unsigned Reg) const {
-  if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
+  if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
     return false;
 
   const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
 
-  unsigned PhysReg = VRM->getPhys(Reg);
+  Register PhysReg = VRM->getPhys(Reg);
 
   if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
     return false;
@@ -654,7 +654,7 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
   }
   std::sort(BankStalls.begin(), BankStalls.end());
 
-  unsigned OrigReg = VRM->getPhys(C.Reg);
+  Register OrigReg = VRM->getPhys(C.Reg);
   LRM->unassign(LI);
   while (!BankStalls.empty()) {
     BankStall BS = BankStalls.pop_back_val();
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
index 39460fbd8a84..d593204cba05 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -40,7 +40,7 @@ void llvm::printLivesAt(SlotIndex SI,
          << *LIS.getInstructionFromIndex(SI);
   unsigned Num = 0;
   for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
-    const unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+    const unsigned Reg = Register::index2VirtReg(I);
     if (!LIS.hasInterval(Reg))
       continue;
     const auto &LI = LIS.getInterval(Reg);
@@ -84,7 +84,7 @@ bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1,
 
 unsigned GCNRegPressure::getRegKind(unsigned Reg,
                                     const MachineRegisterInfo &MRI) {
-  assert(TargetRegisterInfo::isVirtualRegister(Reg));
+  assert(Register::isVirtualRegister(Reg));
   const auto RC = MRI.getRegClass(Reg);
   auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
   return STI->isSGPRClass(RC) ?
@@ -183,7 +183,8 @@ bool GCNRegPressure::less(const GCNSubtarget &ST,
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 LLVM_DUMP_METHOD
 void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const {
-  OS << "VGPRs: " << getVGPRNum();
+  OS << "VGPRs: " << Value[VGPR32] << ' ';
+  OS << "AGPRs: " << Value[AGPR32];
   if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')';
   OS << ", SGPRs: " << getSGPRNum();
   if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGPRNum()) << ')';
@@ -196,8 +197,7 @@ void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const {
 
 static LaneBitmask getDefRegMask(const MachineOperand &MO,
                                  const MachineRegisterInfo &MRI) {
-  assert(MO.isDef() && MO.isReg() &&
-         TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+  assert(MO.isDef() && MO.isReg() && Register::isVirtualRegister(MO.getReg()));
 
   // We don't rely on read-undef flag because in case of tentative schedule
   // tracking it isn't set correctly yet. This works correctly however since
@@ -210,8 +210,7 @@ static LaneBitmask getDefRegMask(const MachineOperand &MO,
 static LaneBitmask getUsedRegMask(const MachineOperand &MO,
                                   const MachineRegisterInfo &MRI,
                                   const LiveIntervals &LIS) {
-  assert(MO.isUse() && MO.isReg() &&
-         TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+  assert(MO.isUse() && MO.isReg() && Register::isVirtualRegister(MO.getReg()));
 
   if (auto SubReg = MO.getSubReg())
     return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
@@ -232,7 +231,7 @@ collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS,
                       const MachineRegisterInfo &MRI) {
   SmallVector<RegisterMaskPair, 8> Res;
   for (const auto &MO : MI.operands()) {
-    if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+    if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()))
       continue;
     if (!MO.isUse() || !MO.readsReg())
       continue;
@@ -278,7 +277,7 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
                                            const MachineRegisterInfo &MRI) {
   GCNRPTracker::LiveRegSet LiveRegs;
   for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
-    auto Reg = TargetRegisterInfo::index2VirtReg(I);
+    auto Reg = Register::index2VirtReg(I);
     if (!LIS.hasInterval(Reg))
       continue;
     auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI);
@@ -329,8 +328,7 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
   MaxPressure = max(AtMIPressure, MaxPressure);
 
   for (const auto &MO : MI.defs()) {
-    if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()) ||
-        MO.isDead())
+    if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()) || MO.isDead())
       continue;
 
     auto Reg = MO.getReg();
@@ -408,8 +406,8 @@ void GCNDownwardRPTracker::advanceToNext() {
   for (const auto &MO : LastTrackedMI->defs()) {
     if (!MO.isReg())
       continue;
-    unsigned Reg = MO.getReg();
-    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+    Register Reg = MO.getReg();
+    if (!Register::isVirtualRegister(Reg))
       continue;
     auto &LiveMask = LiveRegs[Reg];
     auto PrevMask = LiveMask;
@@ -500,7 +498,7 @@ void GCNRPTracker::printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs,
                                  const MachineRegisterInfo &MRI) {
   const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
   for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
-    unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+    unsigned Reg = Register::index2VirtReg(I);
     auto It = LiveRegs.find(Reg);
     if (It != LiveRegs.end() && It->second.any())
       OS << ' ' << printVRegOrUnit(Reg, TRI) << ':'
diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h
index e4894418b943..5862cdb04166 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/lib/Target/AMDGPU/GCNRegPressure.h
@@ -214,7 +214,7 @@ getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) {
   DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> LiveRegMap;
   SmallVector<SlotIndex, 32> LiveIdxs, SRLiveIdxs;
   for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
-    auto Reg = TargetRegisterInfo::index2VirtReg(I);
+    auto Reg = Register::index2VirtReg(I);
     if (!LIS.hasInterval(Reg))
       continue;
     auto &LI = LIS.getInterval(Reg);
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 4ea990ae490e..973491a70d3c 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -71,8 +71,8 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
   // the tracker, so we need to pass those function a non-const copy.
   RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
 
-  std::vector<unsigned> Pressure;
-  std::vector<unsigned> MaxPressure;
+  Pressure.clear();
+  MaxPressure.clear();
 
   if (AtTop)
     TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
@@ -103,10 +103,10 @@
   // the analysis to look through dependencies to find the path with the least
   // register pressure.
 
-  // We only need to update the RPDelata for instructions that increase
-  // register pressure. Instructions that decrease or keep reg pressure
-  // the same will be marked as RegExcess in tryCandidate() when they
-  // are compared with instructions that increase the register pressure.
+  // We only need to update the RPDelta for instructions that increase register
+  // pressure. Instructions that decrease or keep reg pressure the same will be
+  // marked as RegExcess in tryCandidate() when they are compared with
+  // instructions that increase the register pressure.
   if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
     Cand.RPDelta.Excess = PressureChange(SRI->getVGPRPressureSet());
     Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
@@ -160,6 +160,7 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
       if (TryCand.ResDelta == SchedResourceDelta())
         TryCand.initResourceDelta(Zone.DAG, SchedModel);
       Cand.setBest(TryCand);
+      LLVM_DEBUG(traceCandidate(Cand));
     }
   }
 }
@@ -195,6 +196,15 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
     assert(BotCand.Reason != NoCand && "failed to find the first candidate");
   } else {
     LLVM_DEBUG(traceCandidate(BotCand));
+#ifndef NDEBUG
+    if (VerifyScheduling) {
+      SchedCandidate TCand;
+      TCand.reset(CandPolicy());
+      pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
+      assert(TCand.SU == BotCand.SU &&
+             "Last pick result should correspond to re-picking right now");
+    }
+#endif
   }
 
   // Check if the top Q has a better candidate.
@@ -206,6 +216,15 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
     assert(TopCand.Reason != NoCand && "failed to find the first candidate");
   } else {
     LLVM_DEBUG(traceCandidate(TopCand));
+#ifndef NDEBUG
+    if (VerifyScheduling) {
+      SchedCandidate TCand;
+      TCand.reset(CandPolicy());
+      pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
+      assert(TCand.SU == TopCand.SU &&
+             "Last pick result should correspond to re-picking right now");
+    }
+#endif
   }
 
   // Pick best from BotCand and TopCand.
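Note: the initCandidate() hunk above turns Pressure and MaxPressure from locals into members of the strategy object — the matching declarations are in the GCNSchedStrategy.h hunk that follows — so the hot path clears and reuses one allocation instead of constructing fresh vectors per candidate. A generic sketch of that hoisting pattern, independent of the scheduler types:

    #include <vector>

    // clear() resets size but keeps capacity, so after warm-up the hot
    // member function performs no heap allocation.
    class Scratch {
      std::vector<unsigned> Buf; // was a local in the hot function
    public:
      unsigned sumUpTo(unsigned N) {
        Buf.clear();             // reuse capacity from previous calls
        for (unsigned I = 0; I < N; ++I)
          Buf.push_back(I);
        unsigned S = 0;
        for (unsigned V : Buf)
          S += V;
        return S;
      }
    };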
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h
index eaf3dee9ba5d..dd687a930c79 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -40,6 +40,9 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
                     const SIRegisterInfo *SRI,
                     unsigned SGPRPressure, unsigned VGPRPressure);
 
+  std::vector<unsigned> Pressure;
+  std::vector<unsigned> MaxPressure;
+
   unsigned SGPRExcessLimit;
   unsigned VGPRExcessLimit;
   unsigned SGPRCriticalLimit;
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 57c0ba26cc3a..1f94ab799122 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -109,7 +109,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
                                  MCContext *Ctx) {
   int64_t SignedValue = static_cast<int64_t>(Value);
 
-  switch (static_cast<unsigned>(Fixup.getKind())) {
+  switch (Fixup.getTargetKind()) {
   case AMDGPU::fixup_si_sopp_br: {
     int64_t BrImm = (SignedValue - 4) / 4;
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 6549a8d7d592..d352219a7a98 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -87,7 +87,7 @@ std::unique_ptr<MCObjectTargetWriter>
 llvm::createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
                                   bool HasRelocationAddend,
                                   uint8_t ABIVersion) {
-  return llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI,
+  return std::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI,
                                                   HasRelocationAddend,
                                                   ABIVersion);
 }
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 01b53432cbb7..a9888e6ed924 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -196,6 +196,10 @@ void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo,
   printNamedBit(MI, OpNo, O, "slc");
 }
 
+void AMDGPUInstPrinter::printSWZ(const MCInst *MI, unsigned OpNo,
+                                 const MCSubtargetInfo &STI, raw_ostream &O) {
+}
+
 void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo,
                                  const MCSubtargetInfo &STI, raw_ostream &O) {
   printNamedBit(MI, OpNo, O, "tfe");
@@ -292,35 +296,7 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
   }
 #endif
 
-  unsigned AltName = AMDGPU::Reg32;
-
-  if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(RegNo) ||
-      MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(RegNo) ||
-      MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(RegNo))
-    AltName = AMDGPU::Reg64;
-  else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(RegNo) ||
-           MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(RegNo) ||
-           MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(RegNo))
-    AltName = AMDGPU::Reg128;
-  else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(RegNo) ||
-           MRI.getRegClass(AMDGPU::SReg_96RegClassID).contains(RegNo))
-    AltName = AMDGPU::Reg96;
-  else if (MRI.getRegClass(AMDGPU::VReg_160RegClassID).contains(RegNo) ||
-           MRI.getRegClass(AMDGPU::SReg_160RegClassID).contains(RegNo))
-    AltName = AMDGPU::Reg160;
-  else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo) ||
-           MRI.getRegClass(AMDGPU::SGPR_256RegClassID).contains(RegNo))
-    AltName = AMDGPU::Reg256;
-  else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo) ||
-           MRI.getRegClass(AMDGPU::SGPR_512RegClassID).contains(RegNo) ||
-           MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(RegNo))
-    AltName = AMDGPU::Reg512;
-  else if (MRI.getRegClass(AMDGPU::VReg_1024RegClassID).contains(RegNo) ||
-           MRI.getRegClass(AMDGPU::SReg_1024RegClassID).contains(RegNo) ||
-           MRI.getRegClass(AMDGPU::AReg_1024RegClassID).contains(RegNo))
-    AltName = AMDGPU::Reg1024;
-
-  O << getRegisterName(RegNo, AltName);
+  O << getRegisterName(RegNo);
 }
 
 void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
@@ -623,9 +599,11 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   case AMDGPU::V_ADD_CO_CI_U32_e32_gfx10:
   case AMDGPU::V_SUB_CO_CI_U32_e32_gfx10:
   case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx10:
+  case AMDGPU::V_CNDMASK_B32_dpp_gfx10:
   case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx10:
   case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx10:
   case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx10:
+  case AMDGPU::V_CNDMASK_B32_dpp8_gfx10:
  case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx10:
   case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx10:
   case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx10:
@@ -689,6 +667,7 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
     switch (MI->getOpcode()) {
     default: break;
 
+    case AMDGPU::V_CNDMASK_B32_sdwa_gfx10:
     case AMDGPU::V_ADD_CO_CI_U32_sdwa_gfx10:
     case AMDGPU::V_SUB_CO_CI_U32_sdwa_gfx10:
     case AMDGPU::V_SUBREV_CO_CI_U32_sdwa_gfx10:
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index b544d1ef3605..66b70831ff9e 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -12,7 +12,6 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUINSTPRINTER_H
 #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUINSTPRINTER_H
 
-#include "AMDGPUMCTargetDesc.h"
 #include "llvm/MC/MCInstPrinter.h"
 
 namespace llvm {
@@ -26,8 +25,7 @@ public:
   //Autogenerated by tblgen
   void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
                         raw_ostream &O);
-  static const char *getRegisterName(unsigned RegNo,
-                                     unsigned AltIdx = AMDGPU::NoRegAltName);
+  static const char *getRegisterName(unsigned RegNo);
 
   void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
                  const MCSubtargetInfo &STI) override;
@@ -74,6 +72,8 @@ private:
                 raw_ostream &O);
   void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                 raw_ostream &O);
+  void printSWZ(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                raw_ostream &O);
   void printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                 raw_ostream &O);
   void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 8f11433476f4..c15da8075a34 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -250,7 +250,7 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
 bool AMDGPUTargetAsmStreamer::EmitCodeEnd() {
   const uint32_t Encoded_s_code_end = 0xbf9f0000;
   OS << "\t.p2alignl 6, " << Encoded_s_code_end << '\n';
-  OS << "\t.fill 32, 4, " << Encoded_s_code_end << '\n';
+  OS << "\t.fill 48, 4, " << Encoded_s_code_end << '\n';
   return true;
 }
 
@@ -602,7 +602,7 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd() {
   MCStreamer &OS = getStreamer();
   OS.PushSection();
   OS.EmitValueToAlignment(64, Encoded_s_code_end, 4);
-  for (unsigned I = 0; I < 32; ++I)
+  for (unsigned I = 0; I < 48; ++I)
     OS.EmitIntValue(Encoded_s_code_end, 4);
   OS.PopSection();
   return true;
 }
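Note: both EmitCodeEnd() implementations above grow the trailing pad of s_code_end words from 32 to 48. Back-of-envelope arithmetic (mine, not stated in the patch):

    // 0xbf9f0000 encodes s_code_end; each emitted value is 4 bytes wide,
    // so the pad after the 64-byte alignment grows from 128 to 192 bytes.
    static_assert(32 * 4 == 128 && 48 * 4 == 192, "pad size in bytes");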
diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td
index 4735e6cb2446..f33ad950d5d9 100644
--- a/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/lib/Target/AMDGPU/MIMGInstructions.td
@@ -26,7 +26,7 @@ def MIMGEncoding : GenericEnum {
 
 // Represent an ISA-level opcode, independent of the encoding and the
 // vdata/vaddr size.
-class MIMGBaseOpcode {
+class MIMGBaseOpcode : PredicateControl {
   MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(NAME);
   bit Store = 0;
   bit Atomic = 0;
@@ -291,7 +291,7 @@ multiclass MIMG_NoSampler_Src_Helper <bits<8> op, string asm,
 
 multiclass MIMG_NoSampler <bits<8> op, string asm, bit has_d16, bit mip = 0,
                            bit isResInfo = 0> {
-  def "" : MIMGBaseOpcode, PredicateControl {
+  def "" : MIMGBaseOpcode {
     let Coordinates = !if(isResInfo, 0, 1);
     let LodOrClampOrMip = mip;
     let HasD16 = has_d16;
diff --git a/lib/Target/AMDGPU/R600AsmPrinter.cpp b/lib/Target/AMDGPU/R600AsmPrinter.cpp
index 3fb18862fca8..b29cd75f75cf 100644
--- a/lib/Target/AMDGPU/R600AsmPrinter.cpp
+++ b/lib/Target/AMDGPU/R600AsmPrinter.cpp
@@ -104,7 +104,7 @@ bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 
   // Functions needs to be cacheline (256B) aligned.
-  MF.ensureAlignment(8);
+  MF.ensureAlignment(Align(256));
 
   SetupMachineFunction(MF);
diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index 8098b81d1ea2..e4160ac11c86 100644
--- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -303,7 +303,7 @@ private:
       if (!MO.isReg())
         continue;
       if (MO.isDef()) {
-        unsigned Reg = MO.getReg();
+        Register Reg = MO.getReg();
         if (R600::R600_Reg128RegClass.contains(Reg))
           DstMI = Reg;
         else
@@ -312,7 +312,7 @@ private:
               &R600::R600_Reg128RegClass);
       }
       if (MO.isUse()) {
-        unsigned Reg = MO.getReg();
+        Register Reg = MO.getReg();
        if (R600::R600_Reg128RegClass.contains(Reg))
           SrcMI = Reg;
         else
diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index c6e8a060d8a0..fd75c41040e1 100644
--- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -135,7 +135,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
 
       const R600RegisterInfo &TRI = TII->getRegisterInfo();
 
-      unsigned DstReg = MI.getOperand(0).getReg();
+      Register DstReg = MI.getOperand(0).getReg();
       unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
 
       for (unsigned Chan = 0; Chan < 4; ++Chan) {
@@ -155,12 +155,12 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
         unsigned Opcode = BMI->getOpcode();
         // While not strictly necessary from hw point of view, we force
         // all src operands of a dot4 inst to belong to the same slot.
-        unsigned Src0 = BMI->getOperand(
-            TII->getOperandIdx(Opcode, R600::OpName::src0))
-            .getReg();
-        unsigned Src1 = BMI->getOperand(
-            TII->getOperandIdx(Opcode, R600::OpName::src1))
-            .getReg();
+        Register Src0 =
+            BMI->getOperand(TII->getOperandIdx(Opcode, R600::OpName::src0))
+                .getReg();
+        Register Src1 =
+            BMI->getOperand(TII->getOperandIdx(Opcode, R600::OpName::src1))
+                .getReg();
         (void) Src0;
         (void) Src1;
         if ((TRI.getEncodingValue(Src0) & 0xff) < 127 &&
@@ -205,10 +205,10 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
       // T0_Z = CUBE T1_X, T1_Z
       // T0_W = CUBE T1_Y, T1_Z
       for (unsigned Chan = 0; Chan < 4; Chan++) {
-        unsigned DstReg = MI.getOperand(
-            TII->getOperandIdx(MI, R600::OpName::dst)).getReg();
-        unsigned Src0 = MI.getOperand(
-            TII->getOperandIdx(MI, R600::OpName::src0)).getReg();
+        Register DstReg =
+            MI.getOperand(TII->getOperandIdx(MI, R600::OpName::dst)).getReg();
+        Register Src0 =
+            MI.getOperand(TII->getOperandIdx(MI, R600::OpName::src0)).getReg();
         unsigned Src1 = 0;
 
         // Determine the correct source registers
diff --git a/lib/Target/AMDGPU/R600FrameLowering.h b/lib/Target/AMDGPU/R600FrameLowering.h
index 950e238f4979..283e4d1935ea 100644
--- a/lib/Target/AMDGPU/R600FrameLowering.h
+++ b/lib/Target/AMDGPU/R600FrameLowering.h
@@ -15,9 +15,9 @@ namespace llvm {
 
 class R600FrameLowering : public AMDGPUFrameLowering {
 public:
-  R600FrameLowering(StackDirection D, unsigned StackAl, int LAO,
-                    unsigned TransAl = 1) :
-    AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
+  R600FrameLowering(StackDirection D, Align StackAl, int LAO,
+                    Align TransAl = Align::None())
+      : AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
   ~R600FrameLowering() override;
 
   void emitPrologue(MachineFunction &MF,
return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, - DAG.getConstantFP(3.14159265359, DL, MVT::f32)); + DAG.getConstantFP(numbers::pif, DL, MVT::f32)); } SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp index d9e839fe2035..04a5e93f6213 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -97,8 +97,8 @@ bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { for (MachineInstr::const_mop_iterator I = MBBI->operands_begin(), E = MBBI->operands_end(); I != E; ++I) { - if (I->isReg() && !TargetRegisterInfo::isVirtualRegister(I->getReg()) && - I->isUse() && RI.isPhysRegLiveAcrossClauses(I->getReg())) + if (I->isReg() && !Register::isVirtualRegister(I->getReg()) && I->isUse() && + RI.isPhysRegLiveAcrossClauses(I->getReg())) return false; } return true; @@ -242,8 +242,7 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const { for (MachineInstr::const_mop_iterator I = MI.operands_begin(), E = MI.operands_end(); I != E; ++I) { - if (!I->isReg() || !I->isUse() || - TargetRegisterInfo::isVirtualRegister(I->getReg())) + if (!I->isReg() || !I->isUse() || Register::isVirtualRegister(I->getReg())) continue; if (R600::R600_LDS_SRC_REGRegClass.contains(I->getReg())) @@ -294,7 +293,7 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { for (unsigned j = 0; j < 8; j++) { MachineOperand &MO = MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0])); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == R600::ALU_CONST) { MachineOperand &Sel = MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); @@ -317,7 +316,7 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { if (SrcIdx < 0) break; MachineOperand &MO = MI.getOperand(SrcIdx); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == R600::ALU_CONST) { MachineOperand &Sel = MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); @@ -348,7 +347,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI, unsigned i = 0; for (const auto &Src : getSrcs(MI)) { ++i; - unsigned Reg = Src.first->getReg(); + Register Reg = Src.first->getReg(); int Index = RI.getEncodingValue(Reg) & 0xff; if (Reg == R600::OQAP) { Result.push_back(std::make_pair(Index, 0U)); @@ -865,7 +864,7 @@ bool R600InstrInfo::isPredicated(const MachineInstr &MI) const { if (idx < 0) return false; - unsigned Reg = MI.getOperand(idx).getReg(); + Register Reg = MI.getOperand(idx).getReg(); switch (Reg) { default: return false; case R600::PRED_SEL_ONE: @@ -1038,7 +1037,7 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { unsigned RegIndex = MI.getOperand(RegOpIdx).getImm(); unsigned Channel = MI.getOperand(ChanOpIdx).getImm(); unsigned Address = calculateIndirectAddress(RegIndex, Channel); - unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); + Register OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); if (OffsetReg == R600::INDIRECT_BASE_ADDR) { buildMovInstr(MBB, MI, MI.getOperand(DstOpIdx).getReg(), getIndirectAddrRegClass()->getRegister(Address)); @@ -1052,7 +1051,7 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { unsigned RegIndex = MI.getOperand(RegOpIdx).getImm(); unsigned Channel = MI.getOperand(ChanOpIdx).getImm(); unsigned Address = calculateIndirectAddress(RegIndex, Channel); - unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); + Register OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); if 
(OffsetReg == R600::INDIRECT_BASE_ADDR) { buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address), MI.getOperand(ValOpIdx).getReg()); @@ -1193,8 +1192,7 @@ int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass(); for (std::pair<unsigned, unsigned> LI : MRI.liveins()) { unsigned Reg = LI.first; - if (TargetRegisterInfo::isVirtualRegister(Reg) || - !IndirectRC->contains(Reg)) + if (Register::isVirtualRegister(Reg) || !IndirectRC->contains(Reg)) continue; unsigned RegIndex; diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp index 34267a909b5e..7569a2629539 100644 --- a/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -183,7 +183,7 @@ isPhysicalRegCopy(MachineInstr *MI) { if (MI->getOpcode() != R600::COPY) return false; - return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg()); + return !Register::isVirtualRegister(MI->getOperand(1).getReg()); } void R600SchedStrategy::releaseTopNode(SUnit *SU) { @@ -209,7 +209,7 @@ void R600SchedStrategy::releaseBottomNode(SUnit *SU) { bool R600SchedStrategy::regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + if (!Register::isVirtualRegister(Reg)) { return RC->contains(Reg); } else { return MRI->getRegClass(Reg) == RC; @@ -270,7 +270,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { } // Is the result already member of a X/Y/Z/W class ? - unsigned DestReg = MI->getOperand(0).getReg(); + Register DestReg = MI->getOperand(0).getReg(); if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) || regBelongsToClass(DestReg, &R600::R600_AddrRegClass)) return AluT_X; @@ -357,7 +357,7 @@ void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { if (DstIndex == -1) { return; } - unsigned DestReg = MI->getOperand(DstIndex).getReg(); + Register DestReg = MI->getOperand(DstIndex).getReg(); // PressureRegister crashes if an operand is def and used in the same inst // and we try to constraint its regclass for (MachineInstr::mop_iterator It = MI->operands_begin(), diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 9f1cb6582b5c..cec7f563f480 100644 --- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -58,7 +58,7 @@ using namespace llvm; static bool isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) { assert(MRI.isSSA()); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return false; const MachineInstr *MI = MRI.getUniqueVRegDef(Reg); return MI && MI->isImplicitDef(); @@ -197,17 +197,17 @@ unsigned getReassignedChan( MachineInstr *R600VectorRegMerger::RebuildVector( RegSeqInfo *RSI, const RegSeqInfo *BaseRSI, const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const { - unsigned Reg = RSI->Instr->getOperand(0).getReg(); + Register Reg = RSI->Instr->getOperand(0).getReg(); MachineBasicBlock::iterator Pos = RSI->Instr; MachineBasicBlock &MBB = *Pos->getParent(); DebugLoc DL = Pos->getDebugLoc(); - unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg(); + Register SrcVec = BaseRSI->Instr->getOperand(0).getReg(); DenseMap<unsigned, unsigned> UpdatedRegToChan = BaseRSI->RegToChan; std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg; for (DenseMap<unsigned, 
unsigned>::iterator It = RSI->RegToChan.begin(),
       E = RSI->RegToChan.end(); It != E; ++It) {
-    unsigned DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass);
+    Register DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass);
     unsigned SubReg = (*It).first;
     unsigned Swizzle = (*It).second;
     unsigned Chan = getReassignedChan(RemapChan, Swizzle);
@@ -350,7 +350,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
       MachineInstr &MI = *MII;
       if (MI.getOpcode() != R600::REG_SEQUENCE) {
         if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) {
-          unsigned Reg = MI.getOperand(1).getReg();
+          Register Reg = MI.getOperand(1).getReg();
           for (MachineRegisterInfo::def_instr_iterator
                It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end();
                It != E; ++It) {
@@ -363,7 +363,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
       RegSeqInfo RSI(*MRI, &MI);

       // Are all uses of MI swizzable?
-      unsigned Reg = MI.getOperand(0).getReg();
+      Register Reg = MI.getOperand(0).getReg();
       if (!areAllUsesSwizzeable(Reg))
         continue;
diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp
index df200baf11c1..176269f9b68c 100644
--- a/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -90,7 +90,7 @@ private:
       if (DstIdx == -1) {
         continue;
       }
-      unsigned Dst = BI->getOperand(DstIdx).getReg();
+      Register Dst = BI->getOperand(DstIdx).getReg();
       if (isTrans || TII->isTransOnly(*BI)) {
         Result[Dst] = R600::PS;
         continue;
       }
@@ -136,7 +136,7 @@ private:
       int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]);
       if (OperandIdx < 0)
         continue;
-      unsigned Src = MI.getOperand(OperandIdx).getReg();
+      Register Src = MI.getOperand(OperandIdx).getReg();
       const DenseMap<unsigned, unsigned>::const_iterator It = PVs.find(Src);
       if (It != PVs.end())
         MI.getOperand(OperandIdx).setReg(It->second);
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp
index 685df74490fe..ef12c1d24594 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.cpp
+++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -93,7 +93,7 @@ const RegClassWeight &R600RegisterInfo::getRegClassWeight(
 }

 bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const {
-  assert(!TargetRegisterInfo::isVirtualRegister(Reg));
+  assert(!Register::isVirtualRegister(Reg));

   switch (Reg) {
   case R600::OQAP:
diff --git a/lib/Target/AMDGPU/SIAddIMGInit.cpp b/lib/Target/AMDGPU/SIAddIMGInit.cpp
index f8094e35816c..ee011286b8ff 100644
--- a/lib/Target/AMDGPU/SIAddIMGInit.cpp
+++ b/lib/Target/AMDGPU/SIAddIMGInit.cpp
@@ -129,7 +129,7 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
         continue;

       // Create a register for the initialization value.
-      unsigned PrevDst =
+      Register PrevDst =
           MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
       unsigned NewDst = 0; // Final initialized value will be in here

@@ -150,7 +150,7 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
           NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));

         // Initialize dword
-        unsigned SubReg =
+        Register SubReg =
             MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
         BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
             .addImm(0);
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index a0e1ec6ac235..23ef56afc39c 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -99,7 +99,10 @@ enum : uint64_t {
   FPAtomic = UINT64_C(1) << 53,

   // Is a MFMA instruction.
- IsMAI = UINT64_C(1) << 54 + IsMAI = UINT64_C(1) << 54, + + // Is a DOT instruction. + IsDOT = UINT64_C(1) << 55 }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. @@ -444,6 +447,7 @@ namespace DPP { enum DppCtrl : unsigned { QUAD_PERM_FIRST = 0, + QUAD_PERM_ID = 0xE4, // identity permutation QUAD_PERM_LAST = 0xFF, DPP_UNUSED1 = 0x100, ROW_SHL0 = 0x100, diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 624953963cf4..65286751c12d 100644 --- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -113,10 +113,16 @@ class SIFixSGPRCopies : public MachineFunctionPass { public: static char ID; + MachineRegisterInfo *MRI; + const SIRegisterInfo *TRI; + const SIInstrInfo *TII; + SIFixSGPRCopies() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; + void processPHINode(MachineInstr &MI); + StringRef getPassName() const override { return "SI Fix SGPR copies"; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -148,7 +154,7 @@ static bool hasVectorOperands(const MachineInstr &MI, const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { if (!MI.getOperand(i).isReg() || - !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) + !Register::isVirtualRegister(MI.getOperand(i).getReg())) continue; if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg()))) @@ -161,21 +167,19 @@ static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> getCopyRegClasses(const MachineInstr &Copy, const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) { - unsigned DstReg = Copy.getOperand(0).getReg(); - unsigned SrcReg = Copy.getOperand(1).getReg(); + Register DstReg = Copy.getOperand(0).getReg(); + Register SrcReg = Copy.getOperand(1).getReg(); - const TargetRegisterClass *SrcRC = - TargetRegisterInfo::isVirtualRegister(SrcReg) ? - MRI.getRegClass(SrcReg) : - TRI.getPhysRegClass(SrcReg); + const TargetRegisterClass *SrcRC = Register::isVirtualRegister(SrcReg) + ? MRI.getRegClass(SrcReg) + : TRI.getPhysRegClass(SrcReg); // We don't really care about the subregister here. // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg()); - const TargetRegisterClass *DstRC = - TargetRegisterInfo::isVirtualRegister(DstReg) ? - MRI.getRegClass(DstReg) : - TRI.getPhysRegClass(DstReg); + const TargetRegisterClass *DstRC = Register::isVirtualRegister(DstReg) + ? 
MRI.getRegClass(DstReg) + : TRI.getPhysRegClass(DstReg); return std::make_pair(SrcRC, DstRC); } @@ -199,10 +203,10 @@ static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, const SIInstrInfo *TII) { MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); auto &Src = MI.getOperand(1); - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = Src.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - !TargetRegisterInfo::isVirtualRegister(DstReg)) + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = Src.getReg(); + if (!Register::isVirtualRegister(SrcReg) || + !Register::isVirtualRegister(DstReg)) return false; for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) { @@ -238,7 +242,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, MachineRegisterInfo &MRI) { assert(MI.isRegSequence()); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); if (!TRI->isSGPRClass(MRI.getRegClass(DstReg))) return false; @@ -250,7 +254,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, return false; // It is illegal to have vreg inputs to a physreg defining reg_sequence. - if (TargetRegisterInfo::isPhysicalRegister(CopyUse.getOperand(0).getReg())) + if (Register::isPhysicalRegister(CopyUse.getOperand(0).getReg())) return false; const TargetRegisterClass *SrcRC, *DstRC; @@ -281,7 +285,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, bool IsAGPR = TRI->hasAGPRs(DstRC); for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { - unsigned SrcReg = MI.getOperand(I).getReg(); + Register SrcReg = MI.getOperand(I).getReg(); unsigned SrcSubReg = MI.getOperand(I).getSubReg(); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); @@ -291,7 +295,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg); const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC); - unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC); + Register TmpReg = MRI.createVirtualRegister(NewSrcRC); BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg) @@ -299,7 +303,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, if (IsAGPR) { const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC); - unsigned TmpAReg = MRI.createVirtualRegister(NewSrcRC); + Register TmpAReg = MRI.createVirtualRegister(NewSrcRC); unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ? 
AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
       BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
@@ -315,52 +319,6 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
   return true;
 }

-static bool phiHasVGPROperands(const MachineInstr &PHI,
-                               const MachineRegisterInfo &MRI,
-                               const SIRegisterInfo *TRI,
-                               const SIInstrInfo *TII) {
-  for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
-    unsigned Reg = PHI.getOperand(i).getReg();
-    if (TRI->hasVGPRs(MRI.getRegClass(Reg)))
-      return true;
-  }
-  return false;
-}
-
-static bool phiHasBreakDef(const MachineInstr &PHI,
-                           const MachineRegisterInfo &MRI,
-                           SmallSet<unsigned, 8> &Visited) {
-  for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
-    unsigned Reg = PHI.getOperand(i).getReg();
-    if (Visited.count(Reg))
-      continue;
-
-    Visited.insert(Reg);
-
-    MachineInstr *DefInstr = MRI.getVRegDef(Reg);
-    switch (DefInstr->getOpcode()) {
-    default:
-      break;
-    case AMDGPU::SI_IF_BREAK:
-      return true;
-    case AMDGPU::PHI:
-      if (phiHasBreakDef(*DefInstr, MRI, Visited))
-        return true;
-    }
-  }
-  return false;
-}
-
-static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB,
-                                          const TargetRegisterInfo &TRI) {
-  for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(),
-       E = MBB.end(); I != E; ++I) {
-    if (I->modifiesRegister(AMDGPU::EXEC, &TRI))
-      return true;
-  }
-  return false;
-}
-
 static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
                                     const MachineInstr *MoveImm,
                                     const SIInstrInfo *TII,
@@ -422,12 +380,6 @@ bool searchPredecessors(const MachineBasicBlock *MBB,
   return false;
 }

-static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
-                                        const TargetRegisterInfo *TRI) {
-  return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
-           return hasTerminatorThatModifiesExec(*MBB, *TRI); });
-}
-
 // Checks if there is a potential path from instruction From to instruction To.
 // If CutOff is specified and it sits in between on that path, we ignore
 // the higher portion of the path and report it as not reachable.
@@ -468,6 +420,7 @@ getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
 // execution.
 static bool hoistAndMergeSGPRInits(unsigned Reg,
                                    const MachineRegisterInfo &MRI,
+                                   const TargetRegisterInfo *TRI,
                                    MachineDominatorTree &MDT,
                                    const TargetInstrInfo *TII) {
   // List of inits by immediate value.
@@ -482,7 +435,7 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
   for (auto &MI : MRI.def_instructions(Reg)) {
     MachineOperand *Imm = nullptr;
-    for (auto &MO: MI.operands()) {
+    for (auto &MO : MI.operands()) {
       if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
           (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
         Imm = nullptr;
@@ -587,8 +540,44 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
     }
   }

-  for (auto MI : MergedInstrs)
-    MI->removeFromParent();
+  // Remove initializations that were merged into another.
+  for (auto &Init : Inits) {
+    auto &Defs = Init.second;
+    auto I = Defs.begin();
+    while (I != Defs.end()) {
+      if (MergedInstrs.count(*I)) {
+        (*I)->eraseFromParent();
+        I = Defs.erase(I);
+      } else
+        ++I;
+    }
+  }
+
+  // Try to schedule SGPR initializations as early as possible in the MBB.
+  for (auto &Init : Inits) {
+    auto &Defs = Init.second;
+    for (auto MI : Defs) {
+      auto MBB = MI->getParent();
+      MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII);
+      MachineBasicBlock::reverse_iterator B(BoundaryMI);
+      // Check if B should actually be a boundary. If not, set the previous
+      // instruction as the boundary instead.
+ if (!TII->isBasicBlockPrologue(*B)) + B++; + + auto R = std::next(MI->getReverseIterator()); + const unsigned Threshold = 50; + // Search until B or Threshold for a place to insert the initialization. + for (unsigned I = 0; R != B && I < Threshold; ++R, ++I) + if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) || + TII->isSchedulingBoundary(*R, MBB, *MBB->getParent())) + break; + + // Move to directly after R. + if (&*--R != MI) + MBB->splice(*R, MBB, MI); + } + } if (Changed) MRI.clearKillFlags(Reg); @@ -598,9 +587,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const SIInstrInfo *TII = ST.getInstrInfo(); + MRI = &MF.getRegInfo(); + TRI = ST.getRegisterInfo(); + TII = ST.getInstrInfo(); MDT = &getAnalysis<MachineDominatorTree>(); SmallVector<MachineInstr *, 16> Worklist; @@ -617,22 +606,39 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { continue; case AMDGPU::COPY: case AMDGPU::WQM: + case AMDGPU::SOFT_WQM: case AMDGPU::WWM: { - // If the destination register is a physical register there isn't really - // much we can do to fix this. - if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) - continue; + Register DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *SrcRC, *DstRC; - std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI); + std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI); + + if (!Register::isVirtualRegister(DstReg)) { + // If the destination register is a physical register there isn't + // really much we can do to fix this. + // Some special instructions use M0 as an input. Some even only use + // the first lane. Insert a readfirstlane and hope for the best. + if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) { + Register TmpReg + = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + BuildMI(MBB, MI, MI.getDebugLoc(), + TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg) + .add(MI.getOperand(1)); + MI.getOperand(1).setReg(TmpReg); + } + + continue; + } + if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) { - unsigned SrcReg = MI.getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) { + Register SrcReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(SrcReg)) { TII->moveToVALU(MI, MDT); break; } - MachineInstr *DefMI = MRI.getVRegDef(SrcReg); + MachineInstr *DefMI = MRI->getVRegDef(SrcReg); unsigned SMovOp; int64_t Imm; // If we are just copying an immediate, we can replace the copy with @@ -651,70 +657,13 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { break; } case AMDGPU::PHI: { - unsigned Reg = MI.getOperand(0).getReg(); - if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) - break; - - // We don't need to fix the PHI if the common dominator of the - // two incoming blocks terminates with a uniform branch. 
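An aside on the new M0 handling in SIFixSGPRCopies above: when a COPY targets the physical register M0 from a vector register, the pass now routes the value through an SGPR via V_READFIRSTLANE_B32, which reads only lane 0; as the in-code comment says, it inserts a readfirstlane and hopes for the best, which is sound only for wave-uniform values. Illustrative MIR (our sketch of the emitted pattern, not taken from the patch):

    ; before
    $m0 = COPY %0:vgpr_32
    ; after
    %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0:vgpr_32
    $m0 = COPY %1:sreg_32_xm0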
- bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII); - if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) { - MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB(); - MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB(); - - if (!predsHasDivergentTerminator(MBB0, TRI) && - !predsHasDivergentTerminator(MBB1, TRI)) { - LLVM_DEBUG(dbgs() - << "Not fixing PHI for uniform branch: " << MI << '\n'); - break; - } - } - - // If a PHI node defines an SGPR and any of its operands are VGPRs, - // then we need to move it to the VALU. - // - // Also, if a PHI node defines an SGPR and has all SGPR operands - // we must move it to the VALU, because the SGPR operands will - // all end up being assigned the same register, which means - // there is a potential for a conflict if different threads take - // different control flow paths. - // - // For Example: - // - // sgpr0 = def; - // ... - // sgpr1 = def; - // ... - // sgpr2 = PHI sgpr0, sgpr1 - // use sgpr2; - // - // Will Become: - // - // sgpr2 = def; - // ... - // sgpr2 = def; - // ... - // use sgpr2 - // - // The one exception to this rule is when one of the operands - // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK - // instruction. In this case, there we know the program will - // never enter the second block (the loop) without entering - // the first block (where the condition is computed), so there - // is no chance for values to be over-written. - - SmallSet<unsigned, 8> Visited; - if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) { - LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI); - TII->moveToVALU(MI, MDT); - } - + processPHINode(MI); break; } case AMDGPU::REG_SEQUENCE: if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) || !hasVectorOperands(MI, TRI)) { - foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI); + foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI); continue; } @@ -724,9 +673,9 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { break; case AMDGPU::INSERT_SUBREG: { const TargetRegisterClass *DstRC, *Src0RC, *Src1RC; - DstRC = MRI.getRegClass(MI.getOperand(0).getReg()); - Src0RC = MRI.getRegClass(MI.getOperand(1).getReg()); - Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); + DstRC = MRI->getRegClass(MI.getOperand(0).getReg()); + Src0RC = MRI->getRegClass(MI.getOperand(1).getReg()); + Src1RC = MRI->getRegClass(MI.getOperand(2).getReg()); if (TRI->isSGPRClass(DstRC) && (TRI->hasVectorRegisters(Src0RC) || TRI->hasVectorRegisters(Src1RC))) { @@ -735,12 +684,159 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { } break; } + case AMDGPU::V_WRITELANE_B32: { + // Some architectures allow more than one constant bus access without + // SGPR restriction + if (ST.getConstantBusLimit(MI.getOpcode()) != 1) + break; + + // Writelane is special in that it can use SGPR and M0 (which would + // normally count as using the constant bus twice - but in this case it + // is allowed since the lane selector doesn't count as a use of the + // constant bus). However, it is still required to abide by the 1 SGPR + // rule. 
Apply a fix here as we might have multiple SGPRs after + // legalizing VGPRs to SGPRs + int Src0Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); + int Src1Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1); + MachineOperand &Src0 = MI.getOperand(Src0Idx); + MachineOperand &Src1 = MI.getOperand(Src1Idx); + + // Check to see if the instruction violates the 1 SGPR rule + if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) && + Src0.getReg() != AMDGPU::M0) && + (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) && + Src1.getReg() != AMDGPU::M0)) { + + // Check for trivially easy constant prop into one of the operands + // If this is the case then perform the operation now to resolve SGPR + // issue. If we don't do that here we will always insert a mov to m0 + // that can't be resolved in later operand folding pass + bool Resolved = false; + for (MachineOperand *MO : {&Src0, &Src1}) { + if (Register::isVirtualRegister(MO->getReg())) { + MachineInstr *DefMI = MRI->getVRegDef(MO->getReg()); + if (DefMI && TII->isFoldableCopy(*DefMI)) { + const MachineOperand &Def = DefMI->getOperand(0); + if (Def.isReg() && + MO->getReg() == Def.getReg() && + MO->getSubReg() == Def.getSubReg()) { + const MachineOperand &Copied = DefMI->getOperand(1); + if (Copied.isImm() && + TII->isInlineConstant(APInt(64, Copied.getImm(), true))) { + MO->ChangeToImmediate(Copied.getImm()); + Resolved = true; + break; + } + } + } + } + } + + if (!Resolved) { + // Haven't managed to resolve by replacing an SGPR with an immediate + // Move src1 to be in M0 + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(AMDGPU::COPY), AMDGPU::M0) + .add(Src1); + Src1.ChangeToRegister(AMDGPU::M0, false); + } + } + break; + } } } } if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge) - hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT, TII); + hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII); return true; } + +void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { + unsigned numVGPRUses = 0; + bool AllAGPRUses = true; + SetVector<const MachineInstr *> worklist; + SmallSet<const MachineInstr *, 4> Visited; + worklist.insert(&MI); + Visited.insert(&MI); + while (!worklist.empty()) { + const MachineInstr *Instr = worklist.pop_back_val(); + unsigned Reg = Instr->getOperand(0).getReg(); + for (const auto &Use : MRI->use_operands(Reg)) { + const MachineInstr *UseMI = Use.getParent(); + AllAGPRUses &= (UseMI->isCopy() && + TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) || + TRI->isAGPR(*MRI, Use.getReg()); + if (UseMI->isCopy() || UseMI->isRegSequence()) { + if (UseMI->isCopy() && + UseMI->getOperand(0).getReg().isPhysical() && + !TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) { + numVGPRUses++; + } + if (Visited.insert(UseMI).second) + worklist.insert(UseMI); + + continue; + } + + if (UseMI->isPHI()) { + const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg()); + if (!TRI->isSGPRReg(*MRI, Use.getReg()) && + UseRC != &AMDGPU::VReg_1RegClass) + numVGPRUses++; + continue; + } + + const TargetRegisterClass *OpRC = + TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use)); + if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass && + OpRC != &AMDGPU::VS_64RegClass) { + numVGPRUses++; + } + } + } + + Register PHIRes = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes); + if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) { + LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI); + MRI->setRegClass(PHIRes, 
TRI->getEquivalentAGPRClass(RC0)); + } + + bool hasVGPRInput = false; + for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { + unsigned InputReg = MI.getOperand(i).getReg(); + MachineInstr *Def = MRI->getVRegDef(InputReg); + if (TRI->isVectorRegister(*MRI, InputReg)) { + if (Def->isCopy()) { + unsigned SrcReg = Def->getOperand(1).getReg(); + const TargetRegisterClass *RC = + TRI->getRegClassForReg(*MRI, SrcReg); + if (TRI->isSGPRClass(RC)) + continue; + } + hasVGPRInput = true; + break; + } + else if (Def->isCopy() && + TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) { + hasVGPRInput = true; + break; + } + } + + if ((!TRI->isVectorRegister(*MRI, PHIRes) && + RC0 != &AMDGPU::VReg_1RegClass) && + (hasVGPRInput || numVGPRUses > 1)) { + LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI); + TII->moveToVALU(MI); + } + else { + LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI); + TII->legalizeOperands(MI, MDT); + } + +} diff --git a/lib/Target/AMDGPU/SIFixupVectorISel.cpp b/lib/Target/AMDGPU/SIFixupVectorISel.cpp index 5b834c8de13a..a0119297b112 100644 --- a/lib/Target/AMDGPU/SIFixupVectorISel.cpp +++ b/lib/Target/AMDGPU/SIFixupVectorISel.cpp @@ -91,8 +91,7 @@ static bool findSRegBaseAndIndex(MachineOperand *Op, Worklist.push_back(Op); while (!Worklist.empty()) { MachineOperand *WOp = Worklist.pop_back_val(); - if (!WOp->isReg() || - !TargetRegisterInfo::isVirtualRegister(WOp->getReg())) + if (!WOp->isReg() || !Register::isVirtualRegister(WOp->getReg())) continue; MachineInstr *DefInst = MRI.getUniqueVRegDef(WOp->getReg()); switch (DefInst->getOpcode()) { diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index 74d77d328019..4eac03168760 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -142,16 +142,20 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII, switch (Opc) { case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_F16_e64: - case AMDGPU::V_FMAC_F32_e64: { + case AMDGPU::V_FMAC_F32_e64: + case AMDGPU::V_FMAC_F16_e64: { // Special case for mac. Since this is replaced with mad when folded into // src2, we need to check the legality for the final instruction. int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); if (static_cast<int>(OpNo) == Src2Idx) { - bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64; - bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; + bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F16_e64; + bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F32_e64; unsigned Opc = IsFMA ? - AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); + (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) : + (IsF32 ? 
AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); const MCInstrDesc &MadDesc = TII->get(Opc); return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType); } @@ -235,9 +239,11 @@ static bool updateOperand(FoldCandidate &Fold, if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) { MachineBasicBlock *MBB = MI->getParent(); - auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI); - if (Liveness != MachineBasicBlock::LQR_Dead) + auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI, 16); + if (Liveness != MachineBasicBlock::LQR_Dead) { + LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n"); return false; + } MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); int Op32 = Fold.getShrinkOpcode(); @@ -248,7 +254,7 @@ static bool updateOperand(FoldCandidate &Fold, bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg()); const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg()); - unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC); + Register NewReg0 = MRI.createVirtualRegister(Dst0RC); MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32); @@ -314,12 +320,15 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2 unsigned Opc = MI->getOpcode(); if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F32_e64) && + Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) && (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) { - bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64; - bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; + bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F16_e64; + bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F32_e64; unsigned NewOpc = IsFMA ? - AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); + (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) : + (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); // Check if changing this to a v_mad_{f16, f32} instruction will allow us // to fold the operand. @@ -435,7 +444,8 @@ static bool tryToFoldACImm(const SIInstrInfo *TII, OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) return false; - if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy)) { + if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) && + TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) { UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm()); return true; } @@ -443,8 +453,8 @@ static bool tryToFoldACImm(const SIInstrInfo *TII, if (!OpToFold.isReg()) return false; - unsigned UseReg = OpToFold.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(UseReg)) + Register UseReg = OpToFold.getReg(); + if (!Register::isVirtualRegister(UseReg)) return false; if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) { @@ -481,6 +491,9 @@ static bool tryToFoldACImm(const SIInstrInfo *TII, return false; // Can only fold splat constants } + if (!TII->isOperandLegal(*UseMI, UseOpIdx, Op)) + return false; + FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op)); return true; } @@ -518,7 +531,7 @@ void SIFoldOperands::foldOperand( // REG_SEQUENCE instructions, so we have to fold them into the // uses of REG_SEQUENCE. 
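A side note on the IsFMA/IsF32 selection in the SIFoldOperands hunks above: folding an immediate into src2 of a MAC/FMAC is checked against the corresponding MAD/FMA opcode, and the patch extends the old two-way mapping with the f16 FMA cases. A hypothetical helper capturing the resulting table (the pass computes this inline; the sketch compiles only inside the AMDGPU backend):

    // Sketch: MAD/FMA opcode used to check legality of a src2 fold.
    static unsigned madEquivalent(unsigned Opc) {
      switch (Opc) {
      case AMDGPU::V_MAC_F32_e64:  return AMDGPU::V_MAD_F32;
      case AMDGPU::V_MAC_F16_e64:  return AMDGPU::V_MAD_F16;
      case AMDGPU::V_FMAC_F32_e64: return AMDGPU::V_FMA_F32;
      case AMDGPU::V_FMAC_F16_e64: return AMDGPU::V_FMA_F16_gfx9;
      default:                     return Opc;
      }
    }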
if (UseMI->isRegSequence()) { - unsigned RegSeqDstReg = UseMI->getOperand(0).getReg(); + Register RegSeqDstReg = UseMI->getOperand(0).getReg(); unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); MachineRegisterInfo::use_iterator Next; @@ -569,15 +582,18 @@ void SIFoldOperands::foldOperand( OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); if (FoldingImmLike && UseMI->isCopy()) { - unsigned DestReg = UseMI->getOperand(0).getReg(); - const TargetRegisterClass *DestRC - = TargetRegisterInfo::isVirtualRegister(DestReg) ? - MRI->getRegClass(DestReg) : - TRI->getPhysRegClass(DestReg); + Register DestReg = UseMI->getOperand(0).getReg(); + + // Don't fold into a copy to a physical register. Doing so would interfere + // with the register coalescer's logic which would avoid redundant + // initalizations. + if (DestReg.isPhysical()) + return; + + const TargetRegisterClass *DestRC = MRI->getRegClass(DestReg); - unsigned SrcReg = UseMI->getOperand(1).getReg(); - if (TargetRegisterInfo::isVirtualRegister(DestReg) && - TargetRegisterInfo::isVirtualRegister(SrcReg)) { + Register SrcReg = UseMI->getOperand(1).getReg(); + if (SrcReg.isVirtual()) { // XXX - This can be an assert? const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg); if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) { MachineRegisterInfo::use_iterator NextUse; @@ -613,10 +629,17 @@ void SIFoldOperands::foldOperand( return; UseMI->setDesc(TII->get(MovOp)); + MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin(); + MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end(); + while (ImpOpI != ImpOpE) { + MachineInstr::mop_iterator Tmp = ImpOpI; + ImpOpI++; + UseMI->RemoveOperand(UseMI->getOperandNo(Tmp)); + } CopiesToReplace.push_back(UseMI); } else { if (UseMI->isCopy() && OpToFold.isReg() && - TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) && + Register::isVirtualRegister(UseMI->getOperand(0).getReg()) && TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) && TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) && !UseMI->getOperand(1).getSubReg()) { @@ -677,6 +700,9 @@ void SIFoldOperands::foldOperand( // => // %sgpr1 = COPY %sgpr0 UseMI->setDesc(TII->get(AMDGPU::COPY)); + UseMI->getOperand(1).setReg(OpToFold.getReg()); + UseMI->getOperand(1).setSubReg(OpToFold.getSubReg()); + UseMI->getOperand(1).setIsKill(false); UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane) return; } @@ -708,7 +734,7 @@ void SIFoldOperands::foldOperand( // Split 64-bit constants into 32-bits for folding. if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) { - unsigned UseReg = UseOp.getReg(); + Register UseReg = UseOp.getReg(); const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg); if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64) @@ -810,7 +836,7 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI, if (Op.isReg()) { // If this has a subregister, it obviously is a register source. 
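The pervasive unsigned-to-Register rewrite across these files is mechanical: llvm::Register wraps the raw register id, converts implicitly to unsigned, and carries the virtual/physical predicates that previously lived on TargetRegisterInfo. A minimal sketch of the idiom, assuming the Register.h API of this LLVM era (the function name `isVirt` is ours):

    #include "llvm/CodeGen/Register.h"

    // Old style: TargetRegisterInfo::isVirtualRegister(Reg).
    bool isVirt(llvm::Register Reg) {
      return Reg.isVirtual(); // equivalently: llvm::Register::isVirtualRegister(Reg)
    }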
if (Op.getSubReg() != AMDGPU::NoSubRegister || - !TargetRegisterInfo::isVirtualRegister(Op.getReg())) + !Register::isVirtualRegister(Op.getReg())) return &Op; MachineInstr *Def = MRI.getVRegDef(Op.getReg()); @@ -1073,6 +1099,13 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, Copy->addImplicitDefUseOperands(*MF); for (FoldCandidate &Fold : FoldList) { + if (Fold.isReg() && Register::isVirtualRegister(Fold.OpToFold->getReg())) { + Register Reg = Fold.OpToFold->getReg(); + MachineInstr *DefMI = Fold.OpToFold->getParent(); + if (DefMI->readsRegister(AMDGPU::EXEC, TRI) && + execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI)) + continue; + } if (updateOperand(Fold, *TII, *TRI, *ST)) { // Clear kill flags. if (Fold.isReg()) { @@ -1316,6 +1349,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock *MBB : depth_first(&MF)) { MachineBasicBlock::iterator I, Next; + + MachineOperand *CurrentKnownM0Val = nullptr; for (I = MBB->begin(); I != MBB->end(); I = Next) { Next = std::next(I); MachineInstr &MI = *I; @@ -1328,6 +1363,25 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) || !tryFoldOMod(MI)) tryFoldClamp(MI); + + // Saw an unknown clobber of m0, so we no longer know what it is. + if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI)) + CurrentKnownM0Val = nullptr; + continue; + } + + // Specially track simple redefs of m0 to the same value in a block, so we + // can erase the later ones. + if (MI.getOperand(0).getReg() == AMDGPU::M0) { + MachineOperand &NewM0Val = MI.getOperand(1); + if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) { + MI.eraseFromParent(); + continue; + } + + // We aren't tracking other physical registers + CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical()) ? + nullptr : &NewM0Val; continue; } @@ -1339,8 +1393,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (!FoldingImm && !OpToFold.isReg()) continue; - if (OpToFold.isReg() && - !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg())) + if (OpToFold.isReg() && !Register::isVirtualRegister(OpToFold.getReg())) continue; // Prevent folding operands backwards in the function. For example, @@ -1350,8 +1403,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { // ... // %vgpr0 = V_MOV_B32_e32 1, implicit %exec MachineOperand &Dst = MI.getOperand(0); - if (Dst.isReg() && - !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) + if (Dst.isReg() && !Register::isVirtualRegister(Dst.getReg())) continue; foldInstOperand(MI, OpToFold); diff --git a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index f3c9ad63a80a..26bae5734df7 100644 --- a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -120,7 +120,7 @@ static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) { return false; // If this is a load instruction where the result has been coalesced with an operand, then we cannot clause it. 
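The CurrentKnownM0Val tracking added to SIFoldOperands above is, in effect, per-block value numbering for a single physical register: remember the operand last written to m0, erase a later write of the identical value, and forget it on any other clobber (the real pass also stops tracking when the new value is not an immediate or a physical register). A self-contained sketch of the pattern on a toy instruction list (our types, not LLVM's):

    #include <optional>
    #include <string>
    #include <vector>

    struct Inst { std::string Def, Src; }; // toy "Def = Src" copies

    // Drop writes that set Reg to the value it is already known to hold.
    void dedupRedefs(std::vector<Inst> &Block, const std::string &Reg) {
      std::optional<std::string> Known;    // last value written to Reg
      std::vector<Inst> Out;
      for (const Inst &I : Block) {
        if (I.Def == Reg && Known && *Known == I.Src)
          continue;                        // redundant redefinition: erase
        if (I.Def == Reg)
          Known = I.Src;                   // start tracking the new value
        Out.push_back(I);
      }
      Block = std::move(Out);
    }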
for (const MachineOperand &ResMO : MI.defs()) { - unsigned ResReg = ResMO.getReg(); + Register ResReg = ResMO.getReg(); for (const MachineOperand &MO : MI.uses()) { if (!MO.isReg() || MO.isDef()) continue; @@ -144,7 +144,7 @@ static unsigned getMopState(const MachineOperand &MO) { S |= RegState::Kill; if (MO.isEarlyClobber()) S |= RegState::EarlyClobber; - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) && MO.isRenamable()) + if (Register::isPhysicalRegister(MO.getReg()) && MO.isRenamable()) S |= RegState::Renamable; return S; } @@ -152,7 +152,7 @@ static unsigned getMopState(const MachineOperand &MO) { template <typename Callable> void SIFormMemoryClauses::forAllLanes(unsigned Reg, LaneBitmask LaneMask, Callable Func) const { - if (LaneMask.all() || TargetRegisterInfo::isPhysicalRegister(Reg) || + if (LaneMask.all() || Register::isPhysicalRegister(Reg) || LaneMask == MRI->getMaxLaneMaskForVReg(Reg)) { Func(0); return; @@ -216,7 +216,7 @@ bool SIFormMemoryClauses::canBundle(const MachineInstr &MI, if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // If it is tied we will need to write same register as we read. if (MO.isTied()) @@ -227,7 +227,7 @@ bool SIFormMemoryClauses::canBundle(const MachineInstr &MI, if (Conflict == Map.end()) continue; - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return false; LaneBitmask Mask = TRI->getSubRegIndexLaneMask(MO.getSubReg()); @@ -265,13 +265,13 @@ void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI, for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; - LaneBitmask Mask = TargetRegisterInfo::isVirtualRegister(Reg) ? - TRI->getSubRegIndexLaneMask(MO.getSubReg()) : - LaneBitmask::getAll(); + LaneBitmask Mask = Register::isVirtualRegister(Reg) + ? TRI->getSubRegIndexLaneMask(MO.getSubReg()) + : LaneBitmask::getAll(); RegUse &Map = MO.isDef() ? 
Defs : Uses; auto Loc = Map.find(Reg); @@ -389,7 +389,7 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) { for (auto &&R : Defs) { unsigned Reg = R.first; Uses.erase(Reg); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) continue; LIS->removeInterval(Reg); LIS->createAndComputeVirtRegInterval(Reg); @@ -397,7 +397,7 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) { for (auto &&R : Uses) { unsigned Reg = R.first; - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) continue; LIS->removeInterval(Reg); LIS->createAndComputeVirtRegInterval(Reg); diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index feab6bed2603..ed07ed100a19 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -112,6 +112,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); return; } @@ -132,6 +133,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); } @@ -157,6 +159,7 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); return; } @@ -177,6 +180,7 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); } @@ -202,15 +206,15 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, DebugLoc DL; MachineBasicBlock::iterator I = MBB.begin(); - unsigned FlatScratchInitReg - = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); + Register FlatScratchInitReg = + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); MachineRegisterInfo &MRI = MF.getRegInfo(); MRI.addLiveIn(FlatScratchInitReg); MBB.addLiveIn(FlatScratchInitReg); - unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); - unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); + Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); + Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); @@ -424,8 +428,8 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); // We need to insert initialization of the scratch resource descriptor. 
- unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( - AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( + AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; if (ST.isAmdHsaOrMesa(F)) { @@ -539,9 +543,9 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, if (ST.isAmdPalOS()) { // The pointer to the GIT is formed from the offset passed in and either // the amdgpu-git-ptr-high function attribute or the top part of the PC - unsigned RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); - unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); @@ -601,14 +605,14 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, assert(!ST.isAmdHsaOrMesa(Fn)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); - unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); - unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); // Use relocations to get the pointer, and setup the other bits manually. uint64_t Rsrc23 = TII->getScratchRsrcWords23(); if (MFI->hasImplicitBufferPtr()) { - unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64); @@ -640,8 +644,8 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR()); } } else { - unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); BuildMI(MBB, I, DL, SMovB32, Rsrc0) .addExternalSymbol("SCRATCH_RSRC_DWORD0") @@ -669,6 +673,8 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { case TargetStackID::NoAlloc: case TargetStackID::SGPRSpill: return true; + case TargetStackID::SVEVector: + return false; } llvm_unreachable("Invalid TargetStackID::Value"); } diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h index c644f4726e2c..d9970fd6b4b8 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.h +++ b/lib/Target/AMDGPU/SIFrameLowering.h @@ -20,9 +20,9 @@ class GCNSubtarget; class SIFrameLowering final : public AMDGPUFrameLowering { public: - SIFrameLowering(StackDirection D, unsigned StackAl, int LAO, - unsigned TransAl = 1) : - AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} + SIFrameLowering(StackDirection D, Align StackAl, int LAO, + Align TransAl = Align::None()) + : AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} ~SIFrameLowering() override = default; void emitEntryFunctionPrologue(MachineFunction &MF, diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index db0782e2bf3e..56ebf9c06741 100644 --- 
a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -20,11 +20,11 @@
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIDefines.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
@@ -35,6 +35,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/DAGCombine.h"
@@ -44,6 +45,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
@@ -115,7 +117,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
   addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

-  addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
+  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
   addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

   addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
@@ -125,10 +127,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
   addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);

-  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
-  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
+  addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
+  addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);

-  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
+  addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

   addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
@@ -141,12 +143,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

   if (Subtarget->has16BitInsts()) {
-    addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
-    addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
+    addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
+    addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);

     // Unless there are also VOP3P operations, no operations are really legal.
- addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass); - addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass); + addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); } @@ -178,6 +180,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v32i32, Custom); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); @@ -215,31 +218,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v3i16, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); - - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); - - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom); - setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Expand); @@ -653,6 +635,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FADD, MVT::v4f16, Custom); setOperationAction(ISD::FMUL, MVT::v4f16, Custom); + setOperationAction(ISD::FMA, MVT::v4f16, Custom); setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom); setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom); @@ -687,6 +670,33 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, VT, Custom); } + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); + + 
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); + + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom); + setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::ADDCARRY); setTargetDAGCombine(ISD::SUB); @@ -768,19 +778,22 @@ bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const { MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - // TODO: Consider splitting all arguments into 32-bit pieces. - if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) { + if (CC == CallingConv::AMDGPU_KERNEL) + return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); + + if (VT.isVector()) { EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); if (Size == 32) return ScalarVT.getSimpleVT(); - if (Size == 64) + if (Size > 32) return MVT::i32; if (Size == 16 && Subtarget->has16BitInsts()) return VT.isInteger() ? 
MVT::v2i16 : MVT::v2f16; - } + } else if (VT.getSizeInBits() > 32) + return MVT::i32; return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } @@ -788,7 +801,10 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) { + if (CC == CallingConv::AMDGPU_KERNEL) + return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); + + if (VT.isVector()) { unsigned NumElts = VT.getVectorNumElements(); EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); @@ -796,12 +812,13 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, if (Size == 32) return NumElts; - if (Size == 64) - return 2 * NumElts; + if (Size > 32) + return NumElts * ((Size + 31) / 32); if (Size == 16 && Subtarget->has16BitInsts()) - return (VT.getVectorNumElements() + 1) / 2; - } + return (NumElts + 1) / 2; + } else if (VT.getSizeInBits() > 32) + return (VT.getSizeInBits() + 31) / 32; return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } @@ -821,10 +838,10 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( return NumIntermediates; } - if (Size == 64) { + if (Size > 32) { RegisterVT = MVT::i32; IntermediateVT = RegisterVT; - NumIntermediates = 2 * NumElts; + NumIntermediates = NumElts * ((Size + 31) / 32); return NumIntermediates; } @@ -901,7 +918,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = MFI->getImagePSV( *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), CI.getArgOperand(RsrcIntr->RsrcArg)); - Info.align = 0; + Info.align.reset(); } else { Info.ptrVal = MFI->getBufferPSV( *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), @@ -947,7 +964,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); - Info.align = 0; + Info.align.reset(); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4)); @@ -964,7 +981,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = MFI->getBufferPSV( *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), CI.getArgOperand(1)); - Info.align = 0; + Info.align.reset(); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4)); @@ -978,7 +995,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(CI.getOperand(0)->getType() ->getPointerElementType()); Info.ptrVal = CI.getOperand(0); - Info.align = 0; + Info.align.reset(); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; return true; @@ -988,7 +1005,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); - Info.align = 0; + Info.align.reset(); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1)); @@ -1012,7 +1029,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, // This is an abstract access, but we need to specify a type and size. 
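Two cleanups recur through the hunks above. First, the calling-convention code generalizes its 64-bit special case to any scalar wider than 32 bits, so the register count becomes NumElts * ceil(Size / 32): a v3i64 argument, for instance, now breaks down as 3 * 2 = 6 i32 registers. Second, the Info.align changes in the surrounding hunks belong to the migration from raw unsigned alignments to the typed Align/MaybeAlign wrappers from llvm/Support/Alignment.h, where "unknown" is an empty optional rather than the sentinel value 0. A minimal sketch of that API as used here (simplified from the real header):

// MaybeAlign is an optional wrapper around a power-of-two Align value.
MaybeAlign A;               // no known alignment -- replaces the old "align = 0"
A = Align(4);               // a known 4-byte alignment
if (A)                      // explicit test instead of comparing against 0
  assert(A->value() == 4);
A.reset();                  // back to "unknown", which is what these hunks
                            // now write where the code used to assign 0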
Info.memVT = MVT::i32; Info.size = 4; - Info.align = 4; + Info.align = Align(4); Info.flags = MachineMemOperand::MOStore; if (IntrID == Intrinsic::amdgcn_ds_gws_barrier) @@ -1215,21 +1232,12 @@ bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, return true; } -bool SITargetLowering::allowsMisalignedMemoryAccesses( - EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, - bool *IsFast) const { +bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( + unsigned Size, unsigned AddrSpace, unsigned Align, + MachineMemOperand::Flags Flags, bool *IsFast) const { if (IsFast) *IsFast = false; - // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, - // which isn't a simple VT. - // Until MVT is extended to handle this, simply check for the size and - // rely on the condition below: allow accesses if the size is a multiple of 4. - if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 && - VT.getStoreSize() > 16)) { - return false; - } - if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || AddrSpace == AMDGPUAS::REGION_ADDRESS) { // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte @@ -1268,7 +1276,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses( } // Smaller than dword value must be aligned. - if (VT.bitsLT(MVT::i32)) + if (Size < 32) return false; // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the @@ -1277,7 +1285,26 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses( if (IsFast) *IsFast = true; - return VT.bitsGT(MVT::i32) && Align % 4 == 0; + return Size >= 32 && Align >= 4; +} + +bool SITargetLowering::allowsMisalignedMemoryAccesses( + EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + bool *IsFast) const { + if (IsFast) + *IsFast = false; + + // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, + // which isn't a simple VT. + // Until MVT is extended to handle this, simply check for the size and + // rely on the condition below: allow accesses if the size is a multiple of 4. + if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 && + VT.getStoreSize() > 16)) { + return false; + } + + return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace, + Align, Flags, IsFast); } EVT SITargetLowering::getOptimalMemOpType( @@ -1336,9 +1363,9 @@ bool SITargetLowering::isMemOpUniform(const SDNode *N) const { TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(MVT VT) const { - if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) - return TypeSplitVector; - + int NumElts = VT.getVectorNumElements(); + if (NumElts != 1 && VT.getScalarType().bitsLE(MVT::i16)) + return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } @@ -1562,7 +1589,8 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, // entire split argument. if (Arg->Flags.isSplit()) { while (!Arg->Flags.isSplitEnd()) { - assert(!Arg->VT.isVector() && + assert((!Arg->VT.isVector() || + Arg->VT.getScalarSizeInBits() == 16) && "unexpected vector split in ps argument type"); if (!SkipArg) Splits.push_back(*Arg); @@ -1589,29 +1617,32 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, } // Allocate special inputs passed in VGPRs. 
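The allocator below feeds the AMDGPU kernel ABI, in which the workitem IDs X, Y and Z arrive pre-loaded in VGPR0-VGPR2. Turning these static helpers into SITargetLowering methods goes together with the new MRI.setType calls, which attach a GlobalISel low-level type (LLT) to each live-in virtual register so later GlobalISel stages need not infer one from the register class. A condensed sketch of the per-ID pattern, with names taken from the function that follows:

// One workitem ID, e.g. X in VGPR0 (condensed from the code below):
const LLT S32 = LLT::scalar(32);
Register PhysReg = AMDGPU::VGPR0;                        // fixed by the ABI
Register VReg = MF.addLiveIn(PhysReg, &AMDGPU::VGPR_32RegClass);
MF.getRegInfo().setType(VReg, S32);                      // visible to GlobalISel
CCInfo.AllocateReg(PhysReg);
Info.setWorkItemIDX(ArgDescriptor::createRegister(PhysReg));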
-static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { + const LLT S32 = LLT::scalar(32); + MachineRegisterInfo &MRI = MF.getRegInfo(); + if (Info.hasWorkItemIDX()) { - unsigned Reg = AMDGPU::VGPR0; - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + Register Reg = AMDGPU::VGPR0; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); CCInfo.AllocateReg(Reg); Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg)); } if (Info.hasWorkItemIDY()) { - unsigned Reg = AMDGPU::VGPR1; - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + Register Reg = AMDGPU::VGPR1; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); CCInfo.AllocateReg(Reg); Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); } if (Info.hasWorkItemIDZ()) { - unsigned Reg = AMDGPU::VGPR2; - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + Register Reg = AMDGPU::VGPR2; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); CCInfo.AllocateReg(Reg); Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); @@ -1642,7 +1673,8 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u, assert(Reg != AMDGPU::NoRegister); MachineFunction &MF = CCInfo.getMachineFunction(); - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32)); return ArgDescriptor::createRegister(Reg, Mask); } @@ -1671,10 +1703,10 @@ static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) { return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); } -static void allocateSpecialInputVGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { const unsigned Mask = 0x3ff; ArgDescriptor Arg; @@ -1692,10 +1724,11 @@ static void allocateSpecialInputVGPRs(CCState &CCInfo, Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg)); } -static void allocateSpecialInputSGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateSpecialInputSGPRs( + CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { auto &ArgInfo = Info.getArgInfo(); // TODO: Unify handling with private memory pointers. @@ -1728,10 +1761,10 @@ static void allocateSpecialInputSGPRs(CCState &CCInfo, } // Allocate special inputs passed in user SGPRs. 
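User SGPRs differ from the per-wave system SGPRs in that the driver sets them up once per dispatch; allocateHSAUserSGPRs below claims them in a fixed ABI order (implicit buffer pointer, dispatch pointer, queue pointer, kernarg segment pointer, and so on). The notable new detail in this hunk is that the kernarg segment pointer's live-in virtual register now carries a pointer-typed LLT that GlobalISel can query directly. A short sketch, names as in the hunk below:

Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
CCInfo.AllocateReg(InputPtrReg);
Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
// Later, a GlobalISel pass can recover the type instead of guessing:
LLT PtrTy = MRI.getType(VReg);  // 64-bit pointer, constant address space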
-static void allocateHSAUserSGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { if (Info.hasImplicitBufferPtr()) { unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI); MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); @@ -1758,9 +1791,12 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, } if (Info.hasKernargSegmentPtr()) { - unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI); - MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass); + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register InputPtrReg = Info.addKernargSegmentPtr(TRI); CCInfo.AllocateReg(InputPtrReg); + + Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass); + MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); } if (Info.hasDispatchID()) { @@ -1780,32 +1816,32 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, } // Allocate special input registers that are initialized per-wave. -static void allocateSystemSGPRs(CCState &CCInfo, - MachineFunction &MF, - SIMachineFunctionInfo &Info, - CallingConv::ID CallConv, - bool IsShader) { +void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, + MachineFunction &MF, + SIMachineFunctionInfo &Info, + CallingConv::ID CallConv, + bool IsShader) const { if (Info.hasWorkGroupIDX()) { unsigned Reg = Info.addWorkGroupIDX(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } if (Info.hasWorkGroupIDY()) { unsigned Reg = Info.addWorkGroupIDY(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } if (Info.hasWorkGroupIDZ()) { unsigned Reg = Info.addWorkGroupIDZ(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } if (Info.hasWorkGroupInfo()) { unsigned Reg = Info.addWorkGroupInfo(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } @@ -1860,7 +1896,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // resource. For the Code Object V2 ABI, this will be the first 4 user // SGPR inputs. We can reserve those and use them directly. - unsigned PrivateSegmentBufferReg = + Register PrivateSegmentBufferReg = Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); Info.setScratchRSrcReg(PrivateSegmentBufferReg); } else { @@ -1921,7 +1957,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // // FIXME: Should not do this if inline asm is reading/writing these // registers. - unsigned PreloadedSP = Info.getPreloadedReg( + Register PreloadedSP = Info.getPreloadedReg( AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); Info.setStackPtrOffsetReg(PreloadedSP); @@ -1971,7 +2007,7 @@ void SITargetLowering::insertCopiesSplitCSR( else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); - unsigned NewVR = MRI->createVirtualRegister(RC); + Register NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. 
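The copy just below is typical of the pervasive unsigned-to-Register rewrites throughout these hunks, which track a tree-wide cleanup: llvm::Register is a thin wrapper around the old unsigned encoding that makes the virtual-versus-physical distinction explicit instead of relying on callers knowing the magic index ranges. A short sketch of the distinction (API as of this period, simplified):

Register V = MRI->createVirtualRegister(RC);
assert(V.isVirtual());            // numbered in the virtual range
Register P = AMDGPU::VGPR0;       // implicit conversion from the enum/unsigned
assert(P.isPhysical());           // an actual machine register
unsigned Raw = V;                 // converts back for not-yet-migrated APIs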
Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) @@ -2134,7 +2170,7 @@ SDValue SITargetLowering::LowerFormalArguments( assert(VA.isRegLoc() && "Parameter must be in a register!"); - unsigned Reg = VA.getLocReg(); + Register Reg = VA.getLocReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); EVT ValVT = VA.getValVT(); @@ -2652,6 +2688,15 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, bool IsThisReturn = false; MachineFunction &MF = DAG.getMachineFunction(); + if (Callee.isUndef() || isNullConstant(Callee)) { + if (!CLI.IsTailCall) { + for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I) + InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT)); + } + + return Chain; + } + if (IsVarArg) { return lowerUnhandledCall(CLI, InVals, "unsupported call to variadic function "); @@ -2782,7 +2827,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, int32_t Offset = LocMemOffset; SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT); - unsigned Align = 0; + MaybeAlign Alignment; if (IsTailCall) { ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; @@ -2790,8 +2835,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, Flags.getByValSize() : VA.getValVT().getStoreSize(); // FIXME: We can have better than the minimum byval required alignment. - Align = Flags.isByVal() ? Flags.getByValAlign() : - MinAlign(Subtarget->getStackAlignment(), Offset); + Alignment = + Flags.isByVal() + ? MaybeAlign(Flags.getByValAlign()) + : commonAlignment(Subtarget->getStackAlignment(), Offset); Offset = Offset + FPDiff; int FI = MFI.CreateFixedObject(OpSize, Offset, true); @@ -2810,7 +2857,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } else { DstAddr = PtrOff; DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset); - Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset); + Alignment = + commonAlignment(Subtarget->getStackAlignment(), LocMemOffset); } if (Outs[i].Flags.isByVal()) { @@ -2825,7 +2873,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, MemOpChains.push_back(Cpy); } else { - SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align); + SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, + Alignment ? Alignment->value() : 0); MemOpChains.push_back(Store); } } @@ -2937,9 +2986,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, IsThisReturn ? OutVals[0] : SDValue()); } -unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { - unsigned Reg = StringSwitch<unsigned>(RegName) +Register SITargetLowering::getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const { + Register Reg = StringSwitch<Register>(RegName) .Case("m0", AMDGPU::M0) .Case("exec", AMDGPU::EXEC) .Case("exec_lo", AMDGPU::EXEC_LO) @@ -2947,7 +2996,7 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, .Case("flat_scratch", AMDGPU::FLAT_SCR) .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) - .Default(AMDGPU::NoRegister); + .Default(Register()); if (Reg == AMDGPU::NoRegister) { report_fatal_error(Twine("invalid register name \"" @@ -3055,6 +3104,20 @@ splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) { return std::make_pair(LoopBB, RemainderBB); } +/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it. 
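bundleInstWithWaitcnt, defined next, uses the MI bundle machinery from llvm/CodeGen/MachineInstrBundle.h: MIBundleBuilder wraps the half-open instruction range covering MI and the new S_WAITCNT, and finalizeBundle prepends the BUNDLE header instruction, after which later passes treat the pair as a single scheduling unit. The resulting MIR looks roughly like this (illustrative only; operand details assumed):

// BUNDLE implicit-def ..., implicit ... {
//   DS_GWS_BARRIER %data, 0, implicit $m0, implicit $exec
//   S_WAITCNT 0            // guaranteed to stay immediately after the GWS op
// }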
+void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const { + MachineBasicBlock *MBB = MI.getParent(); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + auto I = MI.getIterator(); + auto E = std::next(I); + + BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + + MIBundleBuilder Bundler(*MBB, I, E); + finalizeBundle(*MBB, Bundler.begin()); +} + MachineBasicBlock * SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -3066,12 +3129,13 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *RemainderBB; const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - MachineBasicBlock::iterator Prev = std::prev(MI.getIterator()); + // Apparently kill flags are only valid if the def is in the same block? + if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) + Src->setIsKill(false); std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true); MachineBasicBlock::iterator I = LoopBB->end(); - MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0); const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg( AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1); @@ -3081,23 +3145,9 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, .addImm(0) .addImm(EncodedReg); - // This is a pain, but we're not allowed to have physical register live-ins - // yet. Insert a pair of copies if the VGPR0 hack is necessary. - if (Src && TargetRegisterInfo::isPhysicalRegister(Src->getReg())) { - unsigned Data0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(*BB, std::next(Prev), DL, TII->get(AMDGPU::COPY), Data0) - .add(*Src); + bundleInstWithWaitcnt(MI); - BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::COPY), Src->getReg()) - .addReg(Data0); - - MRI.setSimpleHint(Data0, Src->getReg()); - } - - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT)) - .addImm(0); - - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); // Load and check TRAP_STS.MEM_VIOL BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg) @@ -3138,10 +3188,10 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( MachineBasicBlock::iterator I = LoopBB.begin(); const TargetRegisterClass *BoolRC = TRI->getBoolRC(); - unsigned PhiExec = MRI.createVirtualRegister(BoolRC); - unsigned NewExec = MRI.createVirtualRegister(BoolRC); - unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned CondReg = MRI.createVirtualRegister(BoolRC); + Register PhiExec = MRI.createVirtualRegister(BoolRC); + Register NewExec = MRI.createVirtualRegister(BoolRC); + Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register CondReg = MRI.createVirtualRegister(BoolRC); BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg) .addReg(InitReg) @@ -3240,9 +3290,9 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock::iterator I(&MI); const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC); - unsigned TmpExec = MRI.createVirtualRegister(BoolXExecRC); + Register DstReg = MI.getOperand(0).getReg(); + Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); + Register TmpExec = MRI.createVirtualRegister(BoolXExecRC); unsigned Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; @@ -3315,7 +3365,7 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, SetOn->getOperand(3).setIsUndef(); } else { - unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp) .add(*Idx) .addImm(Offset); @@ -3351,8 +3401,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned Dst = MI.getOperand(0).getReg(); - unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg); @@ -3390,8 +3440,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator I(&MI); - unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg); @@ -3442,7 +3492,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned Dst = MI.getOperand(0).getReg(); + Register Dst = MI.getOperand(0).getReg(); const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src); const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val); @@ -3505,7 +3555,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, const DebugLoc &DL = MI.getDebugLoc(); - unsigned PhiReg = MRI.createVirtualRegister(VecRC); + Register PhiReg = MRI.createVirtualRegister(VecRC); auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset, UseGPRIdxMode, false); @@ -3564,22 +3614,22 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MachineOperand &Src0 = MI.getOperand(1); MachineOperand &Src1 = MI.getOperand(2); - unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, Src0, BoolRC, AMDGPU::sub0, - &AMDGPU::SReg_32_XM0RegClass); + &AMDGPU::SReg_32RegClass); MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, Src0, BoolRC, AMDGPU::sub1, - &AMDGPU::SReg_32_XM0RegClass); + &AMDGPU::SReg_32RegClass); MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, Src1, BoolRC, AMDGPU::sub0, - &AMDGPU::SReg_32_XM0RegClass); + &AMDGPU::SReg_32RegClass); MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, Src1, BoolRC, AMDGPU::sub1, - &AMDGPU::SReg_32_XM0RegClass); + &AMDGPU::SReg_32RegClass); bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); @@ -3632,8 +3682,8 @@ MachineBasicBlock 
*SITargetLowering::EmitInstrWithCustomInserter( // S_CMOV_B64 exec, -1 MachineInstr *FirstMI = &*BB->begin(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned InputReg = MI.getOperand(0).getReg(); - unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register InputReg = MI.getOperand(0).getReg(); + Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); bool Found = false; // Move the COPY of the input reg to the beginning, so that we can use it. @@ -3707,16 +3757,16 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src0 = MI.getOperand(1).getReg(); - unsigned Src1 = MI.getOperand(2).getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(1).getReg(); + Register Src1 = MI.getOperand(2).getReg(); const DebugLoc &DL = MI.getDebugLoc(); - unsigned SrcCond = MI.getOperand(3).getReg(); + Register SrcCond = MI.getOperand(3).getReg(); - unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned SrcCondCopy = MRI.createVirtualRegister(CondRC); + Register SrcCondCopy = MRI.createVirtualRegister(CondRC); BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy) .addReg(SrcCond); @@ -3814,8 +3864,12 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( case AMDGPU::DS_GWS_SEMA_P: case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: case AMDGPU::DS_GWS_BARRIER: - if (getSubtarget()->hasGWSAutoReplay()) + // A s_waitcnt 0 is required to be the instruction immediately following. 
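The GWS (global wave sync) handling is thus split by subtarget capability: hardware that automatically replays a GWS instruction after a memory violation only needs the bundled s_waitcnt, while older hardware keeps the software retry loop built by emitGWSMemViolTestLoop above. That loop, in rough pseudo-assembly (a sketch, not exact compiler output):

// loop:
//   s_setreg_imm32_b32 hwreg(TRAPSTS, MEM_VIOL, 1), 0  ; clear the flag
//   BUNDLE { ds_gws_* ... ; s_waitcnt 0 }               ; the GWS op + wait
//   s_getreg_b32 s_tmp, hwreg(TRAPSTS, MEM_VIOL, 1)     ; re-read the flag
//   s_cmp_lg_u32 s_tmp, 0
//   s_cbranch_scc1 loop                                 ; violated -> retry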
+ if (getSubtarget()->hasGWSAutoReplay()) { + bundleInstWithWaitcnt(MI); return BB; + } + return emitGWSMemViolTestLoop(MI, BB); default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); @@ -3939,6 +3993,30 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); } +SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op, + SelectionDAG &DAG) const { + unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); + assert(VT == MVT::v4i16 || VT == MVT::v4f16); + + SDValue Lo0, Hi0; + std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); + SDValue Lo1, Hi1; + std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1); + SDValue Lo2, Hi2; + std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2); + + SDLoc SL(Op); + + SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Lo2, + Op->getFlags()); + SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Hi2, + Op->getFlags()); + + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); +} + + SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); @@ -3991,6 +4069,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FMINNUM: case ISD::FMAXNUM: return lowerFMINNUM_FMAXNUM(Op, DAG); + case ISD::FMA: + return splitTernaryVectorOp(Op, DAG); case ISD::SHL: case ISD::SRA: case ISD::SRL: @@ -4070,6 +4150,41 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL); } +SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, + SelectionDAG &DAG, + ArrayRef<SDValue> Ops) const { + SDLoc DL(M); + EVT LoadVT = M->getValueType(0); + EVT EltType = LoadVT.getScalarType(); + EVT IntVT = LoadVT.changeTypeToInteger(); + + bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); + + unsigned Opc = + IsFormat ? 
AMDGPUISD::BUFFER_LOAD_FORMAT : AMDGPUISD::BUFFER_LOAD; + + if (IsD16) { + return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); + } + + // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics + if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32) + return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); + + if (isTypeLegal(LoadVT)) { + return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT, + M->getMemOperand(), DAG); + } + + EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT); + SDVTList VTList = DAG.getVTList(CastVT, MVT::Other); + SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT, + M->getMemOperand(), DAG); + return DAG.getMergeValues( + {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)}, + DL); +} + static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); @@ -4196,8 +4311,14 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::INTRINSIC_W_CHAIN: { if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) { - Results.push_back(Res); - Results.push_back(Res.getValue(1)); + if (Res.getOpcode() == ISD::MERGE_VALUES) { + // FIXME: Hacky + Results.push_back(Res.getOperand(0)); + Results.push_back(Res.getOperand(1)); + } else { + Results.push_back(Res); + Results.push_back(Res.getValue(1)); + } return; } @@ -4935,11 +5056,8 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too // small. This requires us to add 4 to the global variable offset in order to // compute the correct address. - unsigned LoFlags = GAFlags; - if (LoFlags == SIInstrInfo::MO_NONE) - LoFlags = SIInstrInfo::MO_REL32; SDValue PtrLo = - DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, LoFlags); + DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags); SDValue PtrHi; if (GAFlags == SIInstrInfo::MO_NONE) { PtrHi = DAG.getTargetConstant(0, DL, MVT::i32); @@ -5563,14 +5681,14 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue}); unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue(); SDValue Ops[] = { - DAG.getEntryNode(), // Chain - Rsrc, // rsrc - DAG.getConstant(0, DL, MVT::i32), // vindex - {}, // voffset - {}, // soffset - {}, // offset - DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + DAG.getEntryNode(), // Chain + Rsrc, // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + {}, // voffset + {}, // soffset + {}, // offset + DAG.getTargetConstant(CachePolicy, DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; // Use the alignment to ensure that the required offsets will fit into the @@ -5579,7 +5697,7 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue(); for (unsigned i = 0; i < NumLoads; ++i) { - Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32); + Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32); Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops, LoadVT, MMO)); } @@ -5758,45 +5876,31 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::amdgcn_fdiv_fast: return lowerFDIV_FAST(Op, DAG); - case Intrinsic::amdgcn_interp_mov: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, 
Op.getOperand(4)); - SDValue Glue = M0.getValue(1); - return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3), Glue); - } - case Intrinsic::amdgcn_interp_p1: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); - SDValue Glue = M0.getValue(1); - return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3), Glue); - } - case Intrinsic::amdgcn_interp_p2: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); - SDValue Glue = SDValue(M0.getNode(), 1); - return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), - Glue); - } case Intrinsic::amdgcn_interp_p1_f16: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); - SDValue Glue = M0.getValue(1); + SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0, + Op.getOperand(5), SDValue()); if (getSubtarget()->getLDSBankCount() == 16) { // 16 bank LDS - SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, - DAG.getConstant(2, DL, MVT::i32), // P0 - Op.getOperand(2), // Attrchan - Op.getOperand(3), // Attr - Glue); + + // FIXME: This implicitly will insert a second CopyToReg to M0. + SDValue S = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32, + DAG.getTargetConstant(Intrinsic::amdgcn_interp_mov, DL, MVT::i32), + DAG.getConstant(2, DL, MVT::i32), // P0 + Op.getOperand(2), // Attrchan + Op.getOperand(3), // Attr + Op.getOperand(5)); // m0 + SDValue Ops[] = { Op.getOperand(1), // Src0 Op.getOperand(2), // Attrchan Op.getOperand(3), // Attr - DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers S, // Src2 - holds two f16 values selected by high - DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers Op.getOperand(4), // high - DAG.getConstant(0, DL, MVT::i1), // $clamp - DAG.getConstant(0, DL, MVT::i32) // $omod + DAG.getTargetConstant(0, DL, MVT::i1), // $clamp + DAG.getTargetConstant(0, DL, MVT::i32) // $omod }; return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops); } else { @@ -5805,28 +5909,28 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), // Src0 Op.getOperand(2), // Attrchan Op.getOperand(3), // Attr - DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers Op.getOperand(4), // high - DAG.getConstant(0, DL, MVT::i1), // $clamp - DAG.getConstant(0, DL, MVT::i32), // $omod - Glue + DAG.getTargetConstant(0, DL, MVT::i1), // $clamp + DAG.getTargetConstant(0, DL, MVT::i32), // $omod + ToM0.getValue(1) }; return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops); } } case Intrinsic::amdgcn_interp_p2_f16: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6)); - SDValue Glue = SDValue(M0.getNode(), 1); + SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0, + Op.getOperand(6), SDValue()); SDValue Ops[] = { Op.getOperand(2), // Src0 Op.getOperand(3), // Attrchan Op.getOperand(4), // Attr - DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers Op.getOperand(1), // Src2 - DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers Op.getOperand(5), // high - DAG.getConstant(0, DL, MVT::i1), // $clamp - Glue + 
DAG.getTargetConstant(0, DL, MVT::i1), // $clamp + ToM0.getValue(1) }; return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops); } @@ -5947,16 +6051,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), Op.getOperand(2)); return DAG.getNode(ISD::BITCAST, DL, VT, Node); } - case Intrinsic::amdgcn_wqm: { - SDValue Src = Op.getOperand(1); - return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src), - 0); - } - case Intrinsic::amdgcn_wwm: { - SDValue Src = Op.getOperand(1); - return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src), - 0); - } case Intrinsic::amdgcn_fmad_ftz: return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); @@ -5977,6 +6071,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SIInstrInfo::MO_ABS32_LO); return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; } + case Intrinsic::amdgcn_is_shared: + case Intrinsic::amdgcn_is_private: { + SDLoc SL(Op); + unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ? + AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS; + SDValue Aperture = getSegmentAperture(AS, SL, DAG); + SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, + Op.getOperand(1)); + + SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec, + DAG.getConstant(1, SL, MVT::i32)); + return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ); + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -5986,6 +6093,30 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } } +// This function computes an appropriate offset to pass to +// MachineMemOperand::setOffset() based on the offset inputs to +// an intrinsic. If any of the offsets are non-constant or +// if VIndex is non-zero then this function returns 0. Otherwise, +// it returns the sum of VOffset, SOffset, and Offset. +static unsigned getBufferOffsetForMMO(SDValue VOffset, + SDValue SOffset, + SDValue Offset, + SDValue VIndex = SDValue()) { + + if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) || + !isa<ConstantSDNode>(Offset)) + return 0; + + if (VIndex) { + if (!isa<ConstantSDNode>(VIndex) || !cast<ConstantSDNode>(VIndex)->isNullValue()) + return 0; + } + + return cast<ConstantSDNode>(VOffset)->getSExtValue() + + cast<ConstantSDNode>(SOffset)->getSExtValue() + + cast<ConstantSDNode>(Offset)->getSExtValue(); +} + SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); @@ -6128,17 +6259,22 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue(), // voffset -- will be set by setBufferOffsets SDValue(), // soffset -- will be set by setBufferOffsets SDValue(), // offset -- will be set by setBufferOffsets - DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); + unsigned Offset = setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); + // We don't know the offset if vindex is non-zero, so clear it. + if (IdxEn) + Offset = 0; + unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? 
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; EVT VT = Op.getValueType(); EVT IntVT = VT.changeTypeToInteger(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(Offset); EVT LoadVT = Op.getValueType(); if (LoadVT.getScalarType() == MVT::f16) @@ -6155,6 +6291,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, } case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: { + const bool IsFormat = IntrID == Intrinsic::amdgcn_raw_buffer_load_format; + auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -6163,32 +6301,18 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Offsets.first, // voffset Op.getOperand(4), // soffset Offsets.second, // offset - Op.getOperand(5), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + Op.getOperand(5), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; - unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ? - AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; - - EVT VT = Op.getValueType(); - EVT IntVT = VT.changeTypeToInteger(); auto *M = cast<MemSDNode>(Op); - EVT LoadVT = Op.getValueType(); - - if (LoadVT.getScalarType() == MVT::f16) - return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, - M, DAG, Ops); - - // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics - if (LoadVT.getScalarType() == MVT::i8 || - LoadVT.getScalarType() == MVT::i16) - return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); - - return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand(), DAG); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5])); + return lowerIntrinsicLoad(M, IsFormat, DAG, Ops); } case Intrinsic::amdgcn_struct_buffer_load: case Intrinsic::amdgcn_struct_buffer_load_format: { + const bool IsFormat = IntrID == Intrinsic::amdgcn_struct_buffer_load_format; + auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -6197,29 +6321,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Offsets.first, // voffset Op.getOperand(5), // soffset Offsets.second, // offset - Op.getOperand(6), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idxen + Op.getOperand(6), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; - unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ? 
- AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; - - EVT VT = Op.getValueType(); - EVT IntVT = VT.changeTypeToInteger(); auto *M = cast<MemSDNode>(Op); - EVT LoadVT = Op.getValueType(); - - if (LoadVT.getScalarType() == MVT::f16) - return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, - M, DAG, Ops); - - // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics - if (LoadVT.getScalarType() == MVT::i8 || - LoadVT.getScalarType() == MVT::i16) - return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); - - return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand(), DAG); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5], + Ops[2])); + return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops); } case Intrinsic::amdgcn_tbuffer_load: { MemSDNode *M = cast<MemSDNode>(Op); @@ -6239,9 +6348,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(4), // voffset Op.getOperand(5), // soffset Op.getOperand(6), // offset - DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format - DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -6264,8 +6373,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(4), // soffset Offsets.second, // offset Op.getOperand(5), // format - Op.getOperand(6), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + Op.getOperand(6), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -6288,8 +6397,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // soffset Offsets.second, // offset Op.getOperand(6), // format - Op.getOperand(7), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idxen + Op.getOperand(7), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -6321,13 +6430,17 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue(), // voffset -- will be set by setBufferOffsets SDValue(), // soffset -- will be set by setBufferOffsets SDValue(), // offset -- will be set by setBufferOffsets - DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + // We don't know the offset if vindex is non-zero, so clear it. 
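A pattern repeated across these intrinsic hunks: after the operand array is built, the lowering now records a byte offset on the MachineMemOperand via setOffset, taken either from the return value of setBufferOffsets or from the getBufferOffsetForMMO helper defined earlier. Since a MUBUF address decomposes roughly as base(rsrc) + vindex * stride + voffset + soffset + imm_offset, an exact compile-time offset exists only when vindex is zero and the remaining parts are constants; everything else is conservatively recorded as 0, meaning unknown. The call-site shape, condensed (operand and array indices vary per intrinsic):

unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
if (IdxEn)      // vindex may be non-zero per lane, so the sum is meaningless
  Offset = 0;   // leave the memory operand's offset unknown
M->getMemOperand()->setOffset(Offset);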
+ if (IdxEn) + Offset = 0; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(Offset); unsigned Opcode = 0; switch (IntrID) { @@ -6377,7 +6490,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_buffer_atomic_umax: case Intrinsic::amdgcn_raw_buffer_atomic_and: case Intrinsic::amdgcn_raw_buffer_atomic_or: - case Intrinsic::amdgcn_raw_buffer_atomic_xor: { + case Intrinsic::amdgcn_raw_buffer_atomic_xor: + case Intrinsic::amdgcn_raw_buffer_atomic_inc: + case Intrinsic::amdgcn_raw_buffer_atomic_dec: { auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -6388,11 +6503,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // soffset Offsets.second, // offset Op.getOperand(6), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6])); unsigned Opcode = 0; switch (IntrID) { @@ -6426,6 +6542,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_buffer_atomic_xor: Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; + case Intrinsic::amdgcn_raw_buffer_atomic_inc: + Opcode = AMDGPUISD::BUFFER_ATOMIC_INC; + break; + case Intrinsic::amdgcn_raw_buffer_atomic_dec: + Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC; + break; default: llvm_unreachable("unhandled atomic opcode"); } @@ -6442,7 +6564,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_struct_buffer_atomic_umax: case Intrinsic::amdgcn_struct_buffer_atomic_and: case Intrinsic::amdgcn_struct_buffer_atomic_or: - case Intrinsic::amdgcn_struct_buffer_atomic_xor: { + case Intrinsic::amdgcn_struct_buffer_atomic_xor: + case Intrinsic::amdgcn_struct_buffer_atomic_inc: + case Intrinsic::amdgcn_struct_buffer_atomic_dec: { auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -6453,11 +6577,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(6), // soffset Offsets.second, // offset Op.getOperand(7), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idxen + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6], + Ops[3])); unsigned Opcode = 0; switch (IntrID) { @@ -6491,6 +6617,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_struct_buffer_atomic_xor: Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; + case Intrinsic::amdgcn_struct_buffer_atomic_inc: + Opcode = AMDGPUISD::BUFFER_ATOMIC_INC; + break; + case Intrinsic::amdgcn_struct_buffer_atomic_dec: + Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC; + break; default: llvm_unreachable("unhandled atomic opcode"); } @@ -6512,12 +6644,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue(), // voffset -- will be set by setBufferOffsets SDValue(), // soffset -- will be set by setBufferOffsets SDValue(), // offset -- will be set by setBufferOffsets - DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]); + unsigned Offset = 
setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]); + // We don't know the offset if vindex is non-zero, so clear it. + if (IdxEn) + Offset = 0; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(Offset); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); @@ -6534,10 +6670,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(6), // soffset Offsets.second, // offset Op.getOperand(7), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7])); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); @@ -6554,10 +6691,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(7), // soffset Offsets.second, // offset Op.getOperand(8), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idxen + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7], + Ops[4])); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); @@ -6686,23 +6825,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; return DAG.getNode(Opc, DL, Op->getVTList(), Ops); } - case Intrinsic::amdgcn_s_sendmsg: - case Intrinsic::amdgcn_s_sendmsghalt: { - unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ? - AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT; - Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); - SDValue Glue = Chain.getValue(1); - return DAG.getNode(NodeOp, DL, MVT::Other, Chain, - Op.getOperand(2), Glue); - } - case Intrinsic::amdgcn_init_exec: { - return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain, - Op.getOperand(2)); - } - case Intrinsic::amdgcn_init_exec_from_input: { - return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain, - Op.getOperand(2), Op.getOperand(3)); - } case Intrinsic::amdgcn_s_barrier: { if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -6733,9 +6855,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(5), // voffset Op.getOperand(6), // soffset Op.getOperand(7), // offset - DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format - DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idexen + DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idexen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -6759,8 +6881,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(6), // soffset Offsets.second, // offset Op.getOperand(7), // format - Op.getOperand(8), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idexen + Op.getOperand(8), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idexen }; unsigned Opc = IsD16 ? 
AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -6784,8 +6906,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(5), // soffset Offsets.second, // offset Op.getOperand(6), // format - Op.getOperand(7), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idexen + Op.getOperand(7), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idexen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -6813,14 +6935,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue(), // voffset -- will be set by setBufferOffsets SDValue(), // soffset -- will be set by setBufferOffsets SDValue(), // offset -- will be set by setBufferOffsets - DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + // We don't know the offset if vindex is non-zero, so clear it. + if (IdxEn) + Offset = 0; unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(Offset); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics EVT VDataType = VData.getValueType().getScalarType(); @@ -6833,10 +6959,22 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, case Intrinsic::amdgcn_raw_buffer_store: case Intrinsic::amdgcn_raw_buffer_store_format: { + const bool IsFormat = + IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format; + SDValue VData = Op.getOperand(2); - bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + EVT VDataVT = VData.getValueType(); + EVT EltType = VDataVT.getScalarType(); + bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); if (IsD16) VData = handleD16VData(VData, DAG); + + if (!isTypeLegal(VDataVT)) { + VData = + DAG.getNode(ISD::BITCAST, DL, + getEquivalentMemType(*DAG.getContext(), VDataVT), VData); + } + auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { Chain, @@ -6846,18 +6984,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Offsets.first, // voffset Op.getOperand(5), // soffset Offsets.second, // offset - Op.getOperand(6), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + Op.getOperand(6), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; - unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ? - AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; + unsigned Opc = + IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE; Opc = IsD16 ? 
AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6])); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics - EVT VDataType = VData.getValueType().getScalarType(); - if (VDataType == MVT::i8 || VDataType == MVT::i16) - return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); + if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) + return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M); return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); @@ -6865,10 +7003,23 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, case Intrinsic::amdgcn_struct_buffer_store: case Intrinsic::amdgcn_struct_buffer_store_format: { + const bool IsFormat = + IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format; + SDValue VData = Op.getOperand(2); - bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + EVT VDataVT = VData.getValueType(); + EVT EltType = VDataVT.getScalarType(); + bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); + if (IsD16) VData = handleD16VData(VData, DAG); + + if (!isTypeLegal(VDataVT)) { + VData = + DAG.getNode(ISD::BITCAST, DL, + getEquivalentMemType(*DAG.getContext(), VDataVT), VData); + } + auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); SDValue Ops[] = { Chain, @@ -6878,17 +7029,19 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Offsets.first, // voffset Op.getOperand(6), // soffset Offsets.second, // offset - Op.getOperand(7), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idxen + Op.getOperand(7), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6], + Ops[3])); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics EVT VDataType = VData.getValueType().getScalarType(); - if (VDataType == MVT::i8 || VDataType == MVT::i16) + if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, @@ -6908,13 +7061,17 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue(), // voffset -- will be set by setBufferOffsets SDValue(), // soffset -- will be set by setBufferOffsets SDValue(), // offset -- will be set by setBufferOffsets - DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + // We don't know the offset if vindex is non-zero, so clear it. + if (IdxEn) + Offset = 0; EVT VT = Op.getOperand(2).getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(Offset); unsigned Opcode = VT.isVector() ? 
    unsigned Opcode = VT.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD :
                                      AMDGPUISD::BUFFER_ATOMIC_FADD;
@@ -6987,7 +7144,7 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
       Overflow += ImmOffset;
       ImmOffset = 0;
     }
-    C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
+    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
    if (Overflow) {
      auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
      if (!N0)
@@ -7001,14 +7158,14 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
   if (!N0)
     N0 = DAG.getConstant(0, DL, MVT::i32);
   if (!C1)
-    C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
+    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
   return {N0, SDValue(C1, 0)};
 }

 // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
 // pointed to by Offsets.
-void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
+unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                             SelectionDAG &DAG, SDValue *Offsets,
                                             unsigned Align) const {
   SDLoc DL(CombinedOffset);
@@ -7018,8 +7175,8 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
     if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
       Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
-      Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
-      return;
+      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
+      return SOffset + ImmOffset;
     }
   }
   if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
@@ -7031,13 +7188,14 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                                 Subtarget, Align)) {
       Offsets[0] = N0;
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
-      Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
-      return;
+      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
+      return 0;
     }
   }
   Offsets[0] = CombinedOffset;
   Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
-  Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
+  Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
+  return 0;
 }

 // Handle 8 bit and 16 bit buffer loads
@@ -7053,9 +7211,10 @@ SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
   SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
                                                Ops, IntVT,
                                                M->getMemOperand());
-  SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL,
-                                        LoadVT.getScalarType(), BufferLoad);
-  return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL);
+  SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
+  LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
+
+  return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
 }

 // Handle 8 bit and 16 bit buffer stores
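`setBufferOffsets` and `AMDGPU::splitMUBUFOffset`, changed above, split a combined byte offset into an SGPR `soffset` plus the instruction's immediate offset field; the new `unsigned` return value reports the total constant offset when it is known. A standalone sketch of the split under an assumed 12-bit immediate field — the real field width and alignment rules depend on the subtarget:

```cpp
#include <cassert>
#include <cstdint>
#include <utility>

// Split a combined byte offset into (soffset, instoffset) where instoffset
// must fit the instruction's unsigned immediate field (12 bits assumed here).
std::pair<uint32_t, uint32_t> splitBufferOffset(uint32_t Combined,
                                                unsigned ImmBits = 12) {
  const uint32_t MaxImm = (1u << ImmBits) - 1;
  uint32_t Imm = Combined & MaxImm; // part that fits in the encoding
  uint32_t SOffset = Combined - Imm; // remainder goes in the SGPR
  return {SOffset, Imm};
}

int main() {
  auto [SOffset, Imm] = splitBufferOffset(5000);
  assert(SOffset + Imm == 5000 && Imm < 4096);
  return 0;
}
```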
@@ -7063,6 +7222,9 @@ SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
                                                       EVT VDataType, SDLoc DL,
                                                       SDValue Ops[],
                                                       MemSDNode *M) const {
+  if (VDataType == MVT::f16)
+    Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
+
   SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
   Ops[1] = BufferStoreExt;
   unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
@@ -7215,8 +7377,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
     assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
            "Custom lowering for non-i32 vectors hasn't been implemented.");

-    if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
-                            *Load->getMemOperand())) {
+    if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+                                        MemVT, *Load->getMemOperand())) {
       SDValue Ops[2];
       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
       return DAG.getMergeValues(Ops, DL);
@@ -7505,6 +7667,19 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
 }

+// Returns immediate value for setting the F32 denorm mode when using the
+// S_DENORM_MODE instruction.
+static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
+                                          const SDLoc &SL, const GCNSubtarget *ST) {
+  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
+  int DPDenormModeDefault = ST->hasFP64Denormals()
+                                ? FP_DENORM_FLUSH_NONE
+                                : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+
+  int Mode = SPDenormMode | (DPDenormModeDefault << 2);
+  return DAG.getTargetConstant(Mode, SL, MVT::i32);
+}
+
 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
     return FastLowered;
@@ -7531,16 +7706,26 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
   const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
                                (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
-  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);

   if (!Subtarget->hasFP32Denormals()) {
     SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-    const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
-                                                      SL, MVT::i32);
-    SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
-                                       DAG.getEntryNode(),
-                                       EnableDenormValue, BitField);
+
+    SDValue EnableDenorm;
+    if (Subtarget->hasDenormModeInst()) {
+      const SDValue EnableDenormValue =
+          getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget);
+
+      EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
+                                 DAG.getEntryNode(), EnableDenormValue);
+    } else {
+      const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
+                                                        SL, MVT::i32);
+      EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
+                                 DAG.getEntryNode(), EnableDenormValue,
+                                 BitField);
+    }
+
     SDValue Ops[3] = {
       NegDivScale0,
       EnableDenorm.getValue(0),
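`getSPDenormModeValue` builds the 4-bit immediate of the new `S_DENORM_MODE` instruction: the low two bits hold the f32 denorm mode and the next two bits the f64/f16 mode, hence `SPDenormMode | (DPDenormModeDefault << 2)`. A sketch of that packing; the `FP_DENORM_*` values mirror the constants referenced in the diff but should be treated as assumptions of this illustration:

```cpp
#include <cassert>

enum DenormMode {
  FP_DENORM_FLUSH_IN_FLUSH_OUT = 0, // flush input and output denorms
  FP_DENORM_FLUSH_OUT = 1,          // flush output denorms only
  FP_DENORM_FLUSH_IN = 2,           // flush input denorms only
  FP_DENORM_FLUSH_NONE = 3          // IEEE denorm handling
};

// Low two bits: f32 mode; next two bits: f64/f16 mode.
constexpr int packDenormMode(int SPMode, int DPMode) {
  return SPMode | (DPMode << 2);
}

int main() {
  // Enable f32 denorms while f64 denorms stay enabled: 0b1111.
  assert(packDenormMode(FP_DENORM_FLUSH_NONE, FP_DENORM_FLUSH_NONE) == 0xF);
  return 0;
}
```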
@@ -7562,19 +7747,29 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
   SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
                              NumeratorScaled, Mul);
-  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
+  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
   SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
                              NumeratorScaled, Fma3);

   if (!Subtarget->hasFP32Denormals()) {
-    const SDValue DisableDenormValue =
-        DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
-    SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
-                                        Fma4.getValue(1),
-                                        DisableDenormValue,
-                                        BitField,
-                                        Fma4.getValue(2));
+
+    SDValue DisableDenorm;
+    if (Subtarget->hasDenormModeInst()) {
+      const SDValue DisableDenormValue =
+          getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget);
+
+      DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
+                                  Fma4.getValue(1), DisableDenormValue,
+                                  Fma4.getValue(2));
+    } else {
+      const SDValue DisableDenormValue =
+          DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
+
+      DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
+                                  Fma4.getValue(1), DisableDenormValue,
+                                  BitField, Fma4.getValue(2));
+    }

     SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                       DisableDenorm, DAG.getRoot());
@@ -7684,8 +7879,8 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
     assert(VT.isVector() &&
            Store->getValue().getValueType().getScalarType() == MVT::i32);

-    if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
-                            *Store->getMemOperand())) {
+    if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+                                        VT, *Store->getMemOperand())) {
       return expandUnalignedStore(Store, DAG);
     }

@@ -10065,7 +10260,7 @@ SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
   // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
   // to try understanding copies to physical registers.
   if (SrcVal.getValueType() == MVT::i1 &&
-      TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
+      Register::isPhysicalRegister(DestReg->getReg())) {
     SDLoc SL(Node);
     MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
     SDValue VReg = DAG.getRegister(
@@ -10218,7 +10413,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
     MachineOperand &Op = MI.getOperand(I);
     if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID &&
          OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) ||
-        !TargetRegisterInfo::isVirtualRegister(Op.getReg()) ||
+        !Register::isVirtualRegister(Op.getReg()) ||
         !TRI->isAGPR(MRI, Op.getReg()))
       continue;
     auto *Src = MRI.getUniqueVRegDef(Op.getReg());
@@ -10256,7 +10451,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
         Node->use_begin()->isMachineOpcode() &&
         Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
         !Node->use_begin()->hasAnyUseOfValue(0))) {
-    unsigned Def = MI.getOperand(0).getReg();
+    Register Def = MI.getOperand(0).getReg();

     // Change this into a noret atomic.
     MI.setDesc(TII->get(NoRetAtomicOp));
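The other mechanical change running through this file is the move from the static `TargetRegisterInfo::is{Physical,Virtual}Register` helpers to the `Register` class. A toy model of the underlying encoding — one id space where the top bit marks virtual registers, as in LLVM's `Register.h` at the time of this import; treat the exact bit as an assumption of this sketch:

```cpp
#include <cassert>
#include <cstdint>

// Toy Register: 32-bit id, high bit set => virtual register.
struct Reg {
  uint32_t Id = 0;
  static constexpr uint32_t VirtualBit = 0x80000000u; // assumed marker bit
  bool isVirtual() const { return Id & VirtualBit; }
  bool isPhysical() const { return Id != 0 && !isVirtual(); }
};

int main() {
  Reg Phys{42};                  // a physical register number
  Reg Virt{Reg::VirtualBit | 7}; // the 8th virtual register
  assert(Phys.isPhysical() && Virt.isVirtual());
  return 0;
}
```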
@@ -10300,7 +10495,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
   // Combine the constants and the pointer.
   const SDValue Ops1[] = {
-    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
+    DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
     Ptr,
     DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
     SubRegHi,
@@ -10330,7 +10525,7 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
   SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);

   const SDValue Ops[] = {
-    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
+    DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
     PtrLo,
     DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
     PtrHi,
@@ -10364,7 +10559,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       return std::make_pair(0U, nullptr);
     case 32:
     case 16:
-      RC = &AMDGPU::SReg_32_XM0RegClass;
+      RC = &AMDGPU::SReg_32RegClass;
       break;
     case 64:
       RC = &AMDGPU::SGPR_64RegClass;
@@ -10373,7 +10568,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       RC = &AMDGPU::SReg_96RegClass;
       break;
     case 128:
-      RC = &AMDGPU::SReg_128RegClass;
+      RC = &AMDGPU::SGPR_128RegClass;
       break;
     case 160:
       RC = &AMDGPU::SReg_160RegClass;
       break;
@@ -10415,6 +10610,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       }
       break;
     case 'a':
+      if (!Subtarget->hasMAIInsts())
+        break;
       switch (VT.getSizeInBits()) {
       default:
         return std::make_pair(0U, nullptr);
@@ -10548,9 +10745,9 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
   Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
 }

-unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
-  const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
-  const unsigned CacheLineAlign = 6; // log2(64)
+Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+  const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
+  const Align CacheLineAlign = Align(64);

   // Pre-GFX10 target did not benefit from loop alignment
   if (!ML || DisableLoopAlignment ||
@@ -10578,7 +10775,7 @@ unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
   // If inner loop block is aligned assume in average half of the alignment
   // size to be added as nops.
   if (MBB != Header)
-    LoopSize += (1 << MBB->getAlignment()) / 2;
+    LoopSize += MBB->getAlignment().value() / 2;

   for (const MachineInstr &MI : *MBB) {
     LoopSize += TII->getInstSizeInBytes(MI);
@@ -10644,7 +10841,7 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
       const MachineRegisterInfo &MRI = MF->getRegInfo();
       const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
       unsigned Reg = R->getReg();
-      if (TRI.isPhysicalRegister(Reg))
+      if (Register::isPhysicalRegister(Reg))
         return !TRI.isSGPRReg(MRI, Reg);

       if (MRI.isLiveIn(Reg)) {
@@ -10683,12 +10880,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
     case ISD::INTRINSIC_W_CHAIN:
       return AMDGPU::isIntrinsicSourceOfDivergence(
           cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
-    // In some cases intrinsics that are a source of divergence have been
-    // lowered to AMDGPUISD so we also need to check those too.
-    case AMDGPUISD::INTERP_MOV:
-    case AMDGPUISD::INTERP_P1:
-    case AMDGPUISD::INTERP_P2:
-      return true;
   }
   return false;
 }
@@ -10748,3 +10939,110 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {

   return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
 }
+
+const TargetRegisterClass *
+SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
+  const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
+  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+  if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
+    return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
+                                               : &AMDGPU::SReg_32RegClass;
+  if (!TRI->isSGPRClass(RC) && !isDivergent)
+    return TRI->getEquivalentSGPRClass(RC);
+  else if (TRI->isSGPRClass(RC) && isDivergent)
+    return TRI->getEquivalentVGPRClass(RC);
+
+  return RC;
+}
+
+static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) {
+  if (!Visited.insert(V).second)
+    return false;
+  bool Result = false;
+  for (auto U : V->users()) {
+    if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
+      if (V == U->getOperand(1)) {
+        switch (Intrinsic->getIntrinsicID()) {
+        default:
+          Result = false;
+          break;
+        case Intrinsic::amdgcn_if_break:
+        case Intrinsic::amdgcn_if:
+        case Intrinsic::amdgcn_else:
+          Result = true;
+          break;
+        }
+      }
+      if (V == U->getOperand(0)) {
+        switch (Intrinsic->getIntrinsicID()) {
+        default:
+          Result = false;
+          break;
+        case Intrinsic::amdgcn_end_cf:
+        case Intrinsic::amdgcn_loop:
+          Result = true;
+          break;
+        }
+      }
+    } else {
+      Result = hasCFUser(U, Visited);
+    }
+    if (Result)
+      break;
+  }
+  return Result;
+}
+
+bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
+                                               const Value *V) const {
+  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
+    switch (Intrinsic->getIntrinsicID()) {
+    default:
+      return false;
+    case Intrinsic::amdgcn_if_break:
+      return true;
+    }
+  }
+  if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) {
+    if (const IntrinsicInst *Intrinsic =
+            dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) {
+      switch (Intrinsic->getIntrinsicID()) {
+      default:
+        return false;
+      case Intrinsic::amdgcn_if:
+      case Intrinsic::amdgcn_else: {
+        ArrayRef<unsigned> Indices = ExtValue->getIndices();
+        if (Indices.size() == 1 && Indices[0] == 1) {
+          return true;
+        }
+      }
+      }
+    }
+  }
+  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+    if (isa<InlineAsm>(CI->getCalledValue())) {
+      const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
+      ImmutableCallSite CS(CI);
+      TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
+          MF.getDataLayout(), Subtarget->getRegisterInfo(), CS);
+      for (auto &TC : TargetConstraints) {
+        if (TC.Type == InlineAsm::isOutput) {
+          ComputeConstraintToUse(TC, SDValue());
+          unsigned AssignedReg;
+          const TargetRegisterClass *RC;
+          std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint(
+              SIRI, TC.ConstraintCode, TC.ConstraintVT);
+          if (RC) {
+            MachineRegisterInfo &MRI = MF.getRegInfo();
+            if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg))
+              return true;
+            else if (SIRI->isSGPRClass(RC))
+              return true;
+          }
+        }
+      }
+    }
+  }
+  SmallPtrSet<const Value *, 16> Visited;
+  return hasCFUser(V, Visited);
+}
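The shape of `hasCFUser` above is a standard def-use DFS with a visited set to cut cycles: follow every user of a value, stop as soon as one is a control-flow intrinsic. A self-contained sketch of that traversal pattern over a toy use graph — the `Node` type and the `IsCFIntrinsic` flag are stand-ins invented for illustration:

```cpp
#include <set>
#include <vector>

struct Node {
  bool IsCFIntrinsic = false; // stand-in for the amdgcn_if/else/loop checks
  std::vector<Node *> Users;  // stand-in for V->users()
};

bool hasCFUser(const Node *V, std::set<const Node *> &Visited) {
  if (!Visited.insert(V).second) // already seen: don't revisit (cuts cycles)
    return false;
  for (const Node *U : V->Users) {
    if (U->IsCFIntrinsic)        // direct use by a control-flow intrinsic
      return true;
    if (hasCFUser(U, Visited))   // otherwise recurse through the user
      return true;
  }
  return false;
}

int main() {
  Node CF{true, {}}, Mid{false, {&CF}}, Root{false, {&Mid}};
  std::set<const Node *> Visited;
  return hasCFUser(&Root, Visited) ? 0 : 1; // reaches CF through Mid
}
```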
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 21a215e16ce7..f0102feb65c4 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -94,6 +94,9 @@ private:
                              SelectionDAG &DAG, ArrayRef<SDValue> Ops,
                              bool IsIntrinsic = false) const;

+  SDValue lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, SelectionDAG &DAG,
+                             ArrayRef<SDValue> Ops) const;
+
   // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
   // dwordx4 if on SI.
   SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
@@ -183,6 +186,7 @@ private:

   unsigned isCFIntrinsic(const SDNode *Intr) const;

+public:
   /// \returns True if fixup needs to be emitted for given global value \p GV,
   /// false otherwise.
   bool shouldEmitFixup(const GlobalValue *GV) const;
@@ -195,11 +199,14 @@ private:
   /// global value \p GV, false otherwise.
   bool shouldEmitPCReloc(const GlobalValue *GV) const;

+private:
   // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
   // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
   // pointed to by Offsets.
-  void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
-                        SDValue *Offsets, unsigned Align = 4) const;
+  /// \returns 0 If there is a non-constant offset or if the offset is 0.
+  /// Otherwise returns the constant offset.
+  unsigned setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
+                            SDValue *Offsets, unsigned Align = 4) const;

   // Handle 8 bit and 16 bit buffer loads
   SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
@@ -235,6 +242,11 @@ public:
   bool canMergeStoresTo(unsigned AS, EVT MemVT,
                         const SelectionDAG &DAG) const override;

+  bool allowsMisalignedMemoryAccessesImpl(
+      unsigned Size, unsigned AS, unsigned Align,
+      MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
+      bool *IsFast = nullptr) const;
+
   bool allowsMisalignedMemoryAccesses(
       EVT VT, unsigned AS, unsigned Align,
       MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
@@ -309,12 +321,13 @@ public:
   SDValue LowerCall(CallLoweringInfo &CLI,
                     SmallVectorImpl<SDValue> &InVals) const override;

-  unsigned getRegisterByName(const char* RegName, EVT VT,
-                             SelectionDAG &DAG) const override;
+  Register getRegisterByName(const char* RegName, EVT VT,
+                             const MachineFunction &MF) const override;

   MachineBasicBlock *splitKillBlock(MachineInstr &MI,
                                     MachineBasicBlock *BB) const;

+  void bundleInstWithWaitcnt(MachineInstr &MI) const;
   MachineBasicBlock *emitGWSMemViolTestLoop(MachineInstr &MI,
                                             MachineBasicBlock *BB) const;

@@ -330,6 +343,7 @@ public:
   bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
   SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
+  SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;

   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
@@ -374,7 +388,37 @@ public:
                                       unsigned Depth = 0) const override;
   AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;

-  unsigned getPrefLoopAlignment(MachineLoop *ML) const override;
+  virtual const TargetRegisterClass *
+  getRegClassFor(MVT VT, bool isDivergent) const override;
+  virtual bool requiresUniformRegister(MachineFunction &MF,
+                                       const Value *V) const override;
+  Align getPrefLoopAlignment(MachineLoop *ML) const override;
+
+  void allocateHSAUserSGPRs(CCState &CCInfo,
+                            MachineFunction &MF,
+                            const SIRegisterInfo &TRI,
+                            SIMachineFunctionInfo &Info) const;
+
+  void allocateSystemSGPRs(CCState &CCInfo,
+                           MachineFunction &MF,
+                           SIMachineFunctionInfo &Info,
+                           CallingConv::ID CallConv,
+                           bool IsShader) const;
+
+  void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
+                                      MachineFunction &MF,
+                                      const SIRegisterInfo &TRI,
+                                      SIMachineFunctionInfo &Info) const;
+
+  void allocateSpecialInputSGPRs(
+      CCState &CCInfo,
+      MachineFunction &MF,
+      const SIRegisterInfo &TRI,
+      SIMachineFunctionInfo &Info) const;
+
+  void allocateSpecialInputVGPRs(CCState &CCInfo,
+                                 MachineFunction &MF,
+                                 const SIRegisterInfo &TRI,
+                                 SIMachineFunctionInfo &Info) const;
 };

 } // End namespace llvm
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index c89d5b71ec5c..dcb04e426584 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1483,12 +1483,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {

       if (BI.Incoming) {
         if (!Brackets)
-          Brackets = llvm::make_unique<WaitcntBrackets>(*BI.Incoming);
+          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
         else
           *Brackets = *BI.Incoming;
       } else {
         if (!Brackets)
-          Brackets = llvm::make_unique<WaitcntBrackets>(ST);
+          Brackets = std::make_unique<WaitcntBrackets>(ST);
         else
           Brackets->clear();
       }
@@ -1508,7 +1508,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
           if (!MoveBracketsToSucc) {
             MoveBracketsToSucc = &SuccBI;
           } else {
-            SuccBI.Incoming = llvm::make_unique<WaitcntBrackets>(*Brackets);
+            SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
           }
         } else if (SuccBI.Incoming->merge(*Brackets)) {
           SuccBI.Dirty = true;
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index 561a16c3e351..4dcbe92861f2 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -124,6 +124,9 @@ class InstSI <dag outs, dag ins, string asm = "",
   // This bit indicates that this is one of MFMA instructions.
   field bit IsMAI = 0;

+  // This bit indicates that this is one of DOT instructions.
+  field bit IsDOT = 0;
+
   // These need to be kept in sync with the enum in SIInstrFlags.
   let TSFlags{0} = SALU;
   let TSFlags{1} = VALU;
@@ -189,6 +192,8 @@ class InstSI <dag outs, dag ins, string asm = "",

   let TSFlags{54} = IsMAI;

+  let TSFlags{55} = IsDOT;
+
   let SchedRW = [Write32Bit];

   field bits<1> DisableSIDecoder = 0;
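The TableGen change reserves `TSFlags` bit 55 for the new `IsDOT` property, next to `IsMAI` at bit 54. On the C++ side such instruction properties are tested by masking a 64-bit flags word; a minimal sketch, with the enum names mirroring SIInstrFlags-style usage but treated as assumptions here:

```cpp
#include <cassert>
#include <cstdint>

namespace SIInstrFlags {
enum : uint64_t {
  IsMAI = UINT64_C(1) << 54, // MFMA instructions
  IsDOT = UINT64_C(1) << 55  // DOT instructions (added by this change)
};
} // namespace SIInstrFlags

int main() {
  // Flags word for a hypothetical dot-product instruction.
  uint64_t TSFlags = SIInstrFlags::IsDOT;
  assert((TSFlags & SIInstrFlags::IsDOT) && !(TSFlags & SIInstrFlags::IsMAI));
  return 0;
}
```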
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index ba8ed6993a56..d97e6a62971b 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -318,8 +318,25 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
   if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
     const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
-    if (SOffset && SOffset->isReg())
-      return false;
+    if (SOffset && SOffset->isReg()) {
+      // We can only handle this if it's a stack access, as any other resource
+      // would require reporting multiple base registers.
+      const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+      if (AddrReg && !AddrReg->isFI())
+        return false;
+
+      const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
+      const SIMachineFunctionInfo *MFI
+        = LdSt.getParent()->getParent()->getInfo<SIMachineFunctionInfo>();
+      if (RSrc->getReg() != MFI->getScratchRSrcReg())
+        return false;
+
+      const MachineOperand *OffsetImm =
+        getNamedOperand(LdSt, AMDGPU::OpName::offset);
+      BaseOp = SOffset;
+      Offset = OffsetImm->getImm();
+      return true;
+    }

     const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
     if (!AddrReg)
@@ -458,9 +475,9 @@ bool SIInstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
   const MachineRegisterInfo &MRI =
       FirstLdSt.getParent()->getParent()->getRegInfo();

-  const unsigned Reg = FirstDst->getReg();
+  const Register Reg = FirstDst->getReg();

-  const TargetRegisterClass *DstRC = TargetRegisterInfo::isVirtualRegister(Reg)
+  const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
                                          ? MRI.getRegClass(Reg)
                                          : RI.getPhysRegClass(Reg);

@@ -807,7 +824,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
          "Not a VGPR32 reg");

   if (Cond.size() == 1) {
-    unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+    Register SReg = MRI.createVirtualRegister(BoolXExecRC);
     BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
       .add(Cond[0]);
     BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
@@ -820,7 +837,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
     assert(Cond[0].isImm() && "Cond[0] is not an immediate");
     switch (Cond[0].getImm()) {
     case SIInstrInfo::SCC_TRUE: {
-      unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                             : AMDGPU::S_CSELECT_B64), SReg)
         .addImm(-1)
@@ -834,7 +851,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
       break;
     }
     case SIInstrInfo::SCC_FALSE: {
-      unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                             : AMDGPU::S_CSELECT_B64), SReg)
         .addImm(0)
@@ -850,7 +867,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
     case SIInstrInfo::VCCNZ: {
       MachineOperand RegOp = Cond[1];
       RegOp.setImplicit(false);
-      unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
       BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
         .add(RegOp);
       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
@@ -864,7 +881,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
     case SIInstrInfo::VCCZ: {
       MachineOperand RegOp = Cond[1];
       RegOp.setImplicit(false);
-      unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
       BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
         .add(RegOp);
       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
@@ -876,8 +893,8 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
       break;
     }
     case SIInstrInfo::EXECNZ: {
-      unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
-      unsigned SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
+      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
+      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
        .addImm(0);
@@ -894,8 +911,8 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
       break;
     }
     case SIInstrInfo::EXECZ: {
-      unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
-      unsigned SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
+      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
+      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
        .addImm(0);
@@ -925,7 +942,7 @@ unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
                                const DebugLoc &DL,
                                unsigned SrcReg, int Value) const {
   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
-  unsigned Reg = MRI.createVirtualRegister(RI.getBoolRC());
+  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
     .addImm(Value)
     .addReg(SrcReg);
@@ -938,7 +955,7 @@ unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
                                const DebugLoc &DL,
                                unsigned SrcReg, int Value) const {
   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
-  unsigned Reg = MRI.createVirtualRegister(RI.getBoolRC());
+  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
     .addImm(Value)
     .addReg(SrcReg);
@@ -1052,12 +1069,12 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,

     // The SGPR spill/restore instructions only work on number sgprs, so we need
     // to make sure we are using the correct register class.
-    if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
+    if (Register::isVirtualRegister(SrcReg) && SpillSize == 4) {
       MachineRegisterInfo &MRI = MF->getRegInfo();
       MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
     }

-    MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
+    BuildMI(MBB, MI, DL, OpDesc)
       .addReg(SrcReg, getKillRegState(isKill)) // data
       .addFrameIndex(FrameIndex)               // addr
       .addMemOperand(MMO)
@@ -1068,11 +1085,6 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     // correctly handled.
     if (RI.spillSGPRToVGPR())
       FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
-    if (ST.hasScalarStores()) {
-      // m0 is used for offset to scalar stores if used to spill.
-      Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
-    }
-
     return;
   }

@@ -1083,7 +1095,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   auto MIB = BuildMI(MBB, MI, DL, get(Opcode));
   if (RI.hasAGPRs(RC)) {
     MachineRegisterInfo &MRI = MF->getRegInfo();
-    unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     MIB.addReg(Tmp, RegState::Define);
   }
   MIB.addReg(SrcReg, getKillRegState(isKill)) // data
@@ -1182,24 +1194,18 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     // FIXME: Maybe this should not include a memoperand because it will be
     // lowered to non-memory instructions.
     const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
-    if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
+    if (Register::isVirtualRegister(DestReg) && SpillSize == 4) {
       MachineRegisterInfo &MRI = MF->getRegInfo();
       MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
     }

     if (RI.spillSGPRToVGPR())
       FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
-    MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
+    BuildMI(MBB, MI, DL, OpDesc, DestReg)
       .addFrameIndex(FrameIndex) // addr
       .addMemOperand(MMO)
       .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
       .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
-
-    if (ST.hasScalarStores()) {
-      // m0 is used for offset to scalar stores if used to spill.
-      Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
-    }
-
     return;
   }

@@ -1208,7 +1214,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   auto MIB = BuildMI(MBB, MI, DL, get(Opcode), DestReg);
   if (RI.hasAGPRs(RC)) {
     MachineRegisterInfo &MRI = MF->getRegInfo();
-    unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     MIB.addReg(Tmp, RegState::Define);
   }
   MIB.addFrameIndex(FrameIndex) // vaddr
@@ -1242,13 +1248,13 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(

   if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) &&
       WorkGroupSize > WavefrontSize) {
-    unsigned TIDIGXReg
-      = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
-    unsigned TIDIGYReg
-      = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
-    unsigned TIDIGZReg
-      = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
-    unsigned InputPtrReg =
+    Register TIDIGXReg =
+        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
+    Register TIDIGYReg =
+        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
+    Register TIDIGZReg =
+        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+    Register InputPtrReg =
         MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
     for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
       if (!Entry.isLiveIn(Reg))
@@ -1410,9 +1416,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     break;

   case AMDGPU::V_MOV_B64_PSEUDO: {
-    unsigned Dst = MI.getOperand(0).getReg();
-    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
-    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+    Register Dst = MI.getOperand(0).getReg();
+    Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
+    Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI.getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
@@ -1437,6 +1443,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.eraseFromParent();
     break;
   }
+  case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
+    expandMovDPP64(MI);
+    break;
+  }
   case AMDGPU::V_SET_INACTIVE_B32: {
     unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
@@ -1469,7 +1479,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   case AMDGPU::V_MOVRELD_B32_V8:
   case AMDGPU::V_MOVRELD_B32_V16: {
     const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
-    unsigned VecReg = MI.getOperand(0).getReg();
+    Register VecReg = MI.getOperand(0).getReg();
     bool IsUndef = MI.getOperand(1).isUndef();
     unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
     assert(VecReg == MI.getOperand(1).getReg());
@@ -1492,9 +1502,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   }
   case AMDGPU::SI_PC_ADD_REL_OFFSET: {
     MachineFunction &MF = *MBB.getParent();
-    unsigned Reg = MI.getOperand(0).getReg();
-    unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
-    unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
+    Register Reg = MI.getOperand(0).getReg();
+    Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
+    Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);

     // Create a bundle so these instructions won't be re-ordered by the
     // post-RA scheduler.
@@ -1531,7 +1541,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     break;
   }
   case TargetOpcode::BUNDLE: {
-    if (!MI.mayLoad())
+    if (!MI.mayLoad() || MI.hasUnmodeledSideEffects())
       return false;

     // If it is a load it must be a memory clause
@@ -1550,6 +1560,64 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   return true;
 }

+std::pair<MachineInstr*, MachineInstr*>
+SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
+  assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  MachineFunction *MF = MBB.getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  Register Dst = MI.getOperand(0).getReg();
+  unsigned Part = 0;
+  MachineInstr *Split[2];
+
+
+  for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
+    auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
+    if (Dst.isPhysical()) {
+      MovDPP.addDef(RI.getSubReg(Dst, Sub));
+    } else {
+      assert(MRI.isSSA());
+      auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      MovDPP.addDef(Tmp);
+    }
+
+    for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
+      const MachineOperand &SrcOp = MI.getOperand(I);
+      assert(!SrcOp.isFPImm());
+      if (SrcOp.isImm()) {
+        APInt Imm(64, SrcOp.getImm());
+        Imm.ashrInPlace(Part * 32);
+        MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
+      } else {
+        assert(SrcOp.isReg());
+        Register Src = SrcOp.getReg();
+        if (Src.isPhysical())
+          MovDPP.addReg(RI.getSubReg(Src, Sub));
+        else
+          MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
+      }
+    }
+
+    for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I)
+      MovDPP.addImm(MI.getOperand(I).getImm());
+
+    Split[Part] = MovDPP;
+    ++Part;
+  }
+
+  if (Dst.isVirtual())
+    BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
+      .addReg(Split[0]->getOperand(0).getReg())
+      .addImm(AMDGPU::sub0)
+      .addReg(Split[1]->getOperand(0).getReg())
+      .addImm(AMDGPU::sub1);
+
+  MI.eraseFromParent();
+  return std::make_pair(Split[0], Split[1]);
+}
+
 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
                                       MachineOperand &Src0,
                                       unsigned Src0OpName,
@@ -1574,7 +1642,7 @@ bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
                                              MachineOperand &RegOp,
                                              MachineOperand &NonRegOp) {
-  unsigned Reg = RegOp.getReg();
+  Register Reg = RegOp.getReg();
   unsigned SubReg = RegOp.getSubReg();
   bool IsKill = RegOp.isKill();
   bool IsDead = RegOp.isDead();
@@ -1646,7 +1714,8 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
 // This needs to be implemented because the source modifiers may be inserted
 // between the true commutable operands, and the base
 // TargetInstrInfo::commuteInstruction uses it.
-bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
+bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
+                                        unsigned &SrcOpIdx0,
                                         unsigned &SrcOpIdx1) const {
   return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
 }
@@ -1710,7 +1779,7 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,

   // FIXME: Virtual register workaround for RegScavenger not working with empty
   // blocks.
-  unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

   auto I = MBB.end();

@@ -2163,7 +2232,7 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,

     SmallVector<unsigned, 8> Regs;
     for (int Idx = 0; Idx != NElts; ++Idx) {
-      unsigned DstElt = MRI.createVirtualRegister(EltRC);
+      Register DstElt = MRI.createVirtualRegister(EltRC);
       Regs.push_back(DstElt);

       unsigned SubIdx = SubIndices[Idx];
@@ -2327,7 +2396,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
         UseMI.RemoveOperand(
             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));

-        unsigned Src1Reg = Src1->getReg();
+        Register Src1Reg = Src1->getReg();
         unsigned Src1SubReg = Src1->getSubReg();
         Src0->setReg(Src1Reg);
         Src0->setSubReg(Src1SubReg);
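`expandMovDPP64` above splits one 64-bit DPP move into two 32-bit `V_MOV_B32_dpp` halves; for immediate operands it takes the low 32 bits for `sub0` and the arithmetically shifted high half for `sub1`. The `APInt` arithmetic reduces to the following, shown standalone (note the shift is arithmetic, matching `ashrInPlace`):

```cpp
#include <cassert>
#include <cstdint>

// Part 0 gets bits [31:0], part 1 gets bits [63:32] of the immediate.
// Signed >> is an arithmetic shift on mainstream compilers, mirroring
// APInt::ashrInPlace in the diff.
uint32_t movDPP64Half(int64_t Imm, unsigned Part) {
  return static_cast<uint32_t>(Imm >> (Part * 32));
}

int main() {
  int64_t Imm = 0x123456789ABCDEF0;
  assert(movDPP64Half(Imm, 0) == 0x9ABCDEF0u); // sub0 half
  assert(movDPP64Half(Imm, 1) == 0x12345678u); // sub1 half
  return 0;
}
```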
@@ -2367,12 +2436,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
             MRI->hasOneUse(Src0->getReg())) {
           Src0->ChangeToImmediate(Def->getOperand(1).getImm());
           Src0Inlined = true;
-        } else if ((RI.isPhysicalRegister(Src0->getReg()) &&
-                   (ST.getConstantBusLimit(Opc) <= 1 &&
-                    RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) ||
-                   (RI.isVirtualRegister(Src0->getReg()) &&
-                   (ST.getConstantBusLimit(Opc) <= 1 &&
-                    RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
+        } else if ((Register::isPhysicalRegister(Src0->getReg()) &&
+                    (ST.getConstantBusLimit(Opc) <= 1 &&
+                     RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) ||
+                   (Register::isVirtualRegister(Src0->getReg()) &&
+                    (ST.getConstantBusLimit(Opc) <= 1 &&
+                     RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
           return false;
           // VGPR is okay as Src0 - fallthrough
       }
@@ -2385,10 +2454,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
             MRI->hasOneUse(Src1->getReg()) &&
             commuteInstruction(UseMI)) {
           Src0->ChangeToImmediate(Def->getOperand(1).getImm());
-        } else if ((RI.isPhysicalRegister(Src1->getReg()) &&
-                    RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
-                   (RI.isVirtualRegister(Src1->getReg()) &&
-                    RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
+        } else if ((Register::isPhysicalRegister(Src1->getReg()) &&
+                    RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
+                   (Register::isVirtualRegister(Src1->getReg()) &&
+                    RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
           return false;
           // VGPR is okay as Src1 - fallthrough
       }
@@ -2472,8 +2541,7 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
 }

 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
-                                                  const MachineInstr &MIb,
-                                                  AliasAnalysis *AA) const {
+                                                  const MachineInstr &MIb) const {
   assert((MIa.mayLoad() || MIa.mayStore()) &&
          "MIa must load from or modify a memory location");
   assert((MIb.mayLoad() || MIb.mayStore()) &&
@@ -2664,6 +2732,7 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
          MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
          MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
          MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
+         MI.getOpcode() == AMDGPU::S_DENORM_MODE ||
          changesVGPRIndexingMode(MI);
 }

@@ -2865,8 +2934,16 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
   if (OpInfo.RegClass < 0)
     return false;

-  if (MO.isImm() && isInlineConstant(MO, OpInfo))
+  const MachineFunction *MF = MI.getParent()->getParent();
+  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+
+  if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
+    if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
+        OpNo == (unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                                     AMDGPU::OpName::src2))
+      return false;
     return RI.opCanUseInlineConstant(OpInfo.OperandType);
+  }

   if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
     return false;
@@ -2874,8 +2951,6 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
   if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
     return true;

-  const MachineFunction *MF = MI.getParent()->getParent();
-  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
   return ST.hasVOP3Literal();
 }

@@ -3036,7 +3111,7 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
   if (!MO.isUse())
     return false;

-  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+  if (Register::isVirtualRegister(MO.getReg()))
     return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));

   // Null is free
@@ -3093,7 +3168,8 @@ static bool shouldReadExec(const MachineInstr &MI) {
       return true;
     }

-    if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
+    if (MI.isPreISelOpcode() ||
+        SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
         SIInstrInfo::isSALU(MI) ||
         SIInstrInfo::isSMRD(MI))
       return false;
@@ -3104,7 +3180,7 @@ static bool isSubRegOf(const SIRegisterInfo &TRI,
                        const MachineOperand &SuperVec,
                        const MachineOperand &SubReg) {
-  if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
+  if (Register::isPhysicalRegister(SubReg.getReg()))
     return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());

   return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
@@ -3144,8 +3220,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
       if (!Op.isReg())
         continue;

-      unsigned Reg = Op.getReg();
-      if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
+      Register Reg = Op.getReg();
+      if (!Register::isVirtualRegister(Reg) && !RC->contains(Reg)) {
        ErrInfo = "inlineasm operand has incorrect register class.";
        return false;
      }
@@ -3209,9 +3285,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
       continue;

     if (RegClass != -1) {
-      unsigned Reg = MI.getOperand(i).getReg();
-      if (Reg == AMDGPU::NoRegister ||
-          TargetRegisterInfo::isVirtualRegister(Reg))
+      Register Reg = MI.getOperand(i).getReg();
+      if (Reg == AMDGPU::NoRegister || Register::isVirtualRegister(Reg))
         continue;

       const TargetRegisterClass *RC = RI.getRegClass(RegClass);
@@ -3304,7 +3379,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
         ErrInfo =
             "Dst register should be tied to implicit use of preserved register";
         return false;
-      } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) &&
+      } else if (Register::isPhysicalRegister(TiedMO.getReg()) &&
                  Dst.getReg() != TiedMO.getReg()) {
         ErrInfo = "Dst register should use same physical register as preserved";
         return false;
@@ -3409,6 +3484,32 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
     }
   }

+  // Special case for writelane - this can break the multiple constant bus rule,
+  // but still can't use more than one SGPR register
+  if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
+    unsigned SGPRCount = 0;
+    Register SGPRUsed = AMDGPU::NoRegister;
+
+    for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) {
+      if (OpIdx == -1)
+        break;
+
+      const MachineOperand &MO = MI.getOperand(OpIdx);
+
+      if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
+        if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
+          if (MO.getReg() != SGPRUsed)
+            ++SGPRCount;
+          SGPRUsed = MO.getReg();
+        }
+      }
+      if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
+        ErrInfo = "WRITELANE instruction violates constant bus restriction";
+        return false;
+      }
+    }
+  }
+
   // Verify misc. restrictions on specific instructions.
   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
@@ -3609,7 +3710,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
     if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
         ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
       ErrInfo = "Invalid dpp_ctrl value: "
-                "broadcats are not supported on GFX10+";
+                "broadcasts are not supported on GFX10+";
       return false;
     }
     if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
@@ -3631,6 +3732,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
   case AMDGPU::PHI: return AMDGPU::PHI;
   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
   case AMDGPU::WQM: return AMDGPU::WQM;
+  case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
   case AMDGPU::WWM: return AMDGPU::WWM;
   case AMDGPU::S_MOV_B32: {
     const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
@@ -3708,9 +3810,9 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
   const MCInstrDesc &Desc = get(MI.getOpcode());
   if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
       Desc.OpInfo[OpNo].RegClass == -1) {
-    unsigned Reg = MI.getOperand(OpNo).getReg();
+    Register Reg = MI.getOperand(OpNo).getReg();

-    if (TargetRegisterInfo::isVirtualRegister(Reg))
+    if (Register::isVirtualRegister(Reg))
       return MRI.getRegClass(Reg);
     return RI.getPhysRegClass(Reg);
   }
@@ -3741,7 +3843,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
   else
     VRC = &AMDGPU::VGPR_32RegClass;

-  unsigned Reg = MRI.createVirtualRegister(VRC);
+  Register Reg = MRI.createVirtualRegister(VRC);
   DebugLoc DL = MBB->findDebugLoc(I);
   BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
   MO.ChangeToRegister(Reg, false);
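The new `V_WRITELANE_B32` check above counts distinct SGPRs among the source operands: writelane may touch the constant bus more than once only if every use reads the same register. The counting logic, reduced to a standalone form (the operand model is a stand-in, not LLVM's `MachineOperand`):

```cpp
#include <cassert>
#include <optional>
#include <vector>

struct Operand {
  std::optional<unsigned> SGPR; // set if this operand reads an SGPR
};

// Count constant-bus slots used: repeats of the same SGPR cost one slot,
// mirroring the sequential SGPRUsed/SGPRCount check in the verifier above.
unsigned countConstantBusUses(const std::vector<Operand> &Ops) {
  unsigned Count = 0;
  std::optional<unsigned> Used;
  for (const Operand &O : Ops) {
    if (!O.SGPR)
      continue; // VGPRs and the like don't use the constant bus
    if (Used != O.SGPR)
      ++Count; // a new SGPR takes another constant-bus slot
    Used = O.SGPR;
  }
  return Count;
}

int main() {
  assert(countConstantBusUses({{5u}, {5u}}) == 1); // same SGPR twice: legal
  assert(countConstantBusUses({{5u}, {6u}}) == 2); // two SGPRs: rejected
  return 0;
}
```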
@@ -3756,7 +3858,7 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
                                          const {
   MachineBasicBlock *MBB = MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
-  unsigned SubReg = MRI.createVirtualRegister(SubRC);
+  Register SubReg = MRI.createVirtualRegister(SubRC);

   if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
     BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
@@ -3768,7 +3870,7 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
   // value so we don't need to worry about merging its subreg index with the
   // SubIdx passed to this function. The register coalescer should be able to
   // eliminate this extra copy.
-  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
+  Register NewSuperReg = MRI.createVirtualRegister(SuperRC);

   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
     .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
@@ -3814,11 +3916,10 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
   if (!MO.isReg())
     return false;

-  unsigned Reg = MO.getReg();
-  const TargetRegisterClass *RC =
-      TargetRegisterInfo::isVirtualRegister(Reg) ?
-      MRI.getRegClass(Reg) :
-      RI.getPhysRegClass(Reg);
+  Register Reg = MO.getReg();
+  const TargetRegisterClass *RC = Register::isVirtualRegister(Reg)
+                                      ? MRI.getRegClass(Reg)
+                                      : RI.getPhysRegClass(Reg);

   const SIRegisterInfo *TRI =
       static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
@@ -3935,13 +4036,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
   if (Opc == AMDGPU::V_WRITELANE_B32) {
     const DebugLoc &DL = MI.getDebugLoc();
     if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
-      unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
           .add(Src0);
       Src0.ChangeToRegister(Reg, false);
     }
     if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
-      unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
       const DebugLoc &DL = MI.getDebugLoc();
       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
           .add(Src1);
@@ -3967,7 +4068,7 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
   // select is uniform.
   if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
       RI.isVGPR(MRI, Src1.getReg())) {
-    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+    Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
     const DebugLoc &DL = MI.getDebugLoc();
     BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
         .add(Src1);
@@ -4003,7 +4104,7 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,

   MI.setDesc(get(CommutedOpc));

-  unsigned Src0Reg = Src0.getReg();
+  Register Src0Reg = Src0.getReg();
   unsigned Src0SubReg = Src0.getSubReg();
   bool Src0Kill = Src0.isKill();

@@ -4039,13 +4140,13 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
     MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
     const DebugLoc &DL = MI.getDebugLoc();
     if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
-      unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
           .add(Src1);
       Src1.ChangeToRegister(Reg, false);
     }
     if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
-      unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
           .add(Src2);
       Src2.ChangeToRegister(Reg, false);
@@ -4113,12 +4214,12 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
                                          MachineRegisterInfo &MRI) const {
   const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
   const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
-  unsigned DstReg = MRI.createVirtualRegister(SRC);
+  Register DstReg = MRI.createVirtualRegister(SRC);
   unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;

   if (RI.hasAGPRs(VRC)) {
     VRC = RI.getEquivalentVGPRClass(VRC);
-    unsigned NewSrcReg = MRI.createVirtualRegister(VRC);
+    Register NewSrcReg = MRI.createVirtualRegister(VRC);
     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
             get(TargetOpcode::COPY), NewSrcReg)
         .addReg(SrcReg);
@@ -4134,7 +4235,7 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,

   SmallVector<unsigned, 8> SRegs;
   for (unsigned i = 0; i < SubRegs; ++i) {
-    unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
             get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
         .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
@@ -4176,7 +4277,7 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
                                          MachineOperand &Op,
                                          MachineRegisterInfo &MRI,
                                          const DebugLoc &DL) const {
-  unsigned OpReg = Op.getReg();
+  Register OpReg = Op.getReg();
   unsigned OpSubReg = Op.getSubReg();

   const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
@@ -4186,7 +4287,7 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
   if (DstRC == OpRC)
     return;

-  unsigned DstReg = MRI.createVirtualRegister(DstRC);
+  Register DstReg = MRI.createVirtualRegister(DstRC);
   MachineInstr *Copy =
       BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);

@@ -4198,8 +4299,19 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
     return;

   // Try to eliminate the copy if it is copying an immediate value.
- if (Def->isMoveImmediate()) + if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass) FoldImmediate(*Copy, *Def, OpReg, &MRI); + + bool ImpDef = Def->isImplicitDef(); + while (!ImpDef && Def && Def->isCopy()) { + if (Def->getOperand(1).getReg().isPhysical()) + break; + Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg()); + ImpDef = Def && Def->isImplicitDef(); + } + if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) && + !ImpDef) + Copy->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); } // Emit the actual waterfall loop, executing the wrapped instruction for each @@ -4223,18 +4335,18 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock::iterator I = LoopBB.begin(); - unsigned VRsrc = Rsrc.getReg(); + Register VRsrc = Rsrc.getReg(); unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef()); - unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC); - unsigned CondReg0 = MRI.createVirtualRegister(BoolXExecRC); - unsigned CondReg1 = MRI.createVirtualRegister(BoolXExecRC); - unsigned AndCond = MRI.createVirtualRegister(BoolXExecRC); - unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); + Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC); + Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC); + Register AndCond = MRI.createVirtualRegister(BoolXExecRC); + Register SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register SRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); // Beginning of the loop, read the next Rsrc variant. BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0) @@ -4302,7 +4414,7 @@ static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, unsigned MovExecOpc = ST.isWave32() ? 
@@ -4302,7 +4414,7 @@ static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
   unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
   const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

-  unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC);
+  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);

   // Save the EXEC mask
   BuildMI(MBB, I, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
@@ -4370,10 +4482,10 @@ extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
                                              AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);

   // Create an empty resource descriptor
-  unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
-  unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-  unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-  unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+  Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
   uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();

   // Zero64 = 0
@@ -4430,7 +4542,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
     const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
       if (!MI.getOperand(i).isReg() ||
-          !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
+          !Register::isVirtualRegister(MI.getOperand(i).getReg()))
         continue;
       const TargetRegisterClass *OpRC =
           MRI.getRegClass(MI.getOperand(i).getReg());
@@ -4447,8 +4559,16 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
     if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
       if (!VRC) {
         assert(SRC);
-        VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) ? RI.getEquivalentAGPRClass(SRC)
-                                                : RI.getEquivalentVGPRClass(SRC);
+        if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
+          VRC = &AMDGPU::VReg_1RegClass;
+        } else
+          VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
+                    ? RI.getEquivalentAGPRClass(SRC)
+                    : RI.getEquivalentVGPRClass(SRC);
+      } else {
+        VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
+                  ? RI.getEquivalentAGPRClass(VRC)
+                  : RI.getEquivalentVGPRClass(VRC);
       }
       RC = VRC;
     } else {
@@ -4458,7 +4578,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
     // Update all the operands so they have the same type.
     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
       MachineOperand &Op = MI.getOperand(I);
-      if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
+      if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg()))
         continue;

       // MI is a PHI instruction.
@@ -4483,7 +4603,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
     // subregister index types e.g. sub0_sub1 + sub2 + sub3
     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
       MachineOperand &Op = MI.getOperand(I);
-      if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
+      if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg()))
         continue;

       const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
@@ -4502,8 +4622,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
   // Legalize INSERT_SUBREG
   // src0 must have the same register class as dst
   if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
-    unsigned Dst = MI.getOperand(0).getReg();
-    unsigned Src0 = MI.getOperand(1).getReg();
+    Register Dst = MI.getOperand(0).getReg();
+    Register Src0 = MI.getOperand(1).getReg();
     const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
     const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
     if (DstRC != Src0RC) {
@@ -4577,13 +4697,13 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
   if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
     // This is already an ADDR64 instruction so we need to add the pointer
     // extracted from the resource descriptor to the current value of VAddr.
-    unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+    Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

     const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
-    unsigned CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
-    unsigned CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
+    Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
+    Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);

     unsigned RsrcPtr, NewSRsrc;
     std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
@@ -4623,7 +4743,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
     unsigned RsrcPtr, NewSRsrc;
     std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);

-    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+    Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

     MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
     MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
     MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
@@ -4661,6 +4781,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
         MIB.addImm(TFE->getImm());
       }

+      MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
+
       MIB.cloneMemRefs(MI);
       Addr64 = MIB;
     } else {
@@ -4933,8 +5055,8 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
     bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
     unsigned NewDstReg = AMDGPU::NoRegister;
     if (HasDst) {
-      unsigned DstReg = Inst.getOperand(0).getReg();
-      if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+      Register DstReg = Inst.getOperand(0).getReg();
+      if (Register::isPhysicalRegister(DstReg))
         continue;

       // Update the destination register class.
@@ -4943,7 +5065,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, continue; if (Inst.isCopy() && - TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) && + Register::isVirtualRegister(Inst.getOperand(1).getReg()) && NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { // Instead of creating a copy where src and dst are the same register // class, we just replace all uses of dst with src. These kinds of @@ -4988,8 +5110,8 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - unsigned OldDstReg = Inst.getOperand(0).getReg(); - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register OldDstReg = Inst.getOperand(0).getReg(); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned Opc = Inst.getOpcode(); assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32); @@ -5022,8 +5144,8 @@ void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src = Inst.getOperand(1); - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned SubOp = ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32; @@ -5052,7 +5174,7 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, MachineOperand &Src1 = Inst.getOperand(2); if (ST.hasDLInsts()) { - unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); @@ -5072,8 +5194,8 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, bool Src1IsSGPR = Src1.isReg() && RI.isSGPRClass(MRI.getRegClass(Src1.getReg())); MachineInstr *Xor; - unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); // Build a pair of scalar instructions and add them to the work list. 
// The next iteration over the work list will lower these to the vector @@ -5117,8 +5239,8 @@ void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist, MachineOperand &Src0 = Inst.getOperand(1); MachineOperand &Src1 = Inst.getOperand(2); - unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm) .add(Src0) @@ -5146,8 +5268,8 @@ void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, MachineOperand &Src0 = Inst.getOperand(1); MachineOperand &Src1 = Inst.getOperand(2); - unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm) .add(Src1); @@ -5189,16 +5311,16 @@ void SIInstrInfo::splitScalar64BitUnaryOp( const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); - unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); + Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); - unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); + Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); - unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); + Register FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) .addImm(AMDGPU::sub0) @@ -5226,12 +5348,12 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned CarryReg = MRI.createVirtualRegister(CarryRC); - unsigned DeadCarryReg = MRI.createVirtualRegister(CarryRC); + Register CarryReg = MRI.createVirtualRegister(CarryRC); + Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src0 = Inst.getOperand(1); @@ -5327,17 +5449,17 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); - unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); + Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 
MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) .add(SrcReg0Sub0) .add(SrcReg1Sub0); - unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); + Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) .add(SrcReg0Sub1) .add(SrcReg1Sub1); - unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); + Register FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) .addImm(AMDGPU::sub0) @@ -5368,7 +5490,7 @@ void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); - unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); MachineOperand* Op0; MachineOperand* Op1; @@ -5384,7 +5506,7 @@ void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm) .add(*Op0); - unsigned NewDest = MRI.createVirtualRegister(DestRC); + Register NewDest = MRI.createVirtualRegister(DestRC); MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest) .addReg(Interm) @@ -5411,8 +5533,8 @@ void SIInstrInfo::splitScalar64BitBCNT( MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass; - unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); @@ -5451,9 +5573,9 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, Offset == 0 && "Not implemented"); if (BitWidth < 32) { - unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) @@ -5476,8 +5598,8 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, } MachineOperand &Src = Inst.getOperand(1); - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) .addImm(31) @@ -5506,6 +5628,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( switch (UseMI.getOpcode()) { case AMDGPU::COPY: case AMDGPU::WQM: + case AMDGPU::SOFT_WQM: case AMDGPU::WWM: case AMDGPU::REG_SEQUENCE: case AMDGPU::PHI: @@ -5531,7 +5654,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, MachineRegisterInfo &MRI, MachineInstr &Inst) const { - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); MachineBasicBlock *MBB = 
Inst.getParent(); MachineOperand &Src0 = Inst.getOperand(1); MachineOperand &Src1 = Inst.getOperand(2); @@ -5539,8 +5662,8 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, switch (Inst.getOpcode()) { case AMDGPU::S_PACK_LL_B32_B16: { - unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); // FIXME: Can do a lot better if we know the high bits of src0 or src1 are // 0. @@ -5558,7 +5681,7 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, break; } case AMDGPU::S_PACK_LH_B32_B16: { - unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) .addImm(0xffff); BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg) @@ -5568,8 +5691,8 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, break; } case AMDGPU::S_PACK_HH_B32_B16: { - unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) .addImm(16) .add(Src0); @@ -5623,17 +5746,27 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( case AMDGPU::REG_SEQUENCE: case AMDGPU::INSERT_SUBREG: case AMDGPU::WQM: + case AMDGPU::SOFT_WQM: case AMDGPU::WWM: { const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); if (RI.hasAGPRs(SrcRC)) { if (RI.hasAGPRs(NewDstRC)) return nullptr; - NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); + switch (Inst.getOpcode()) { + case AMDGPU::PHI: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::INSERT_SUBREG: + NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); + break; + default: + NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); + } + if (!NewDstRC) return nullptr; } else { - if (RI.hasVGPRs(NewDstRC)) + if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) return nullptr; NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); @@ -5686,7 +5819,7 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI, return MO.getReg(); // If this could be a VGPR or an SGPR, Check the dynamic register class. 
- unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); if (RI.isSGPRClass(RegRC)) UsedSGPRs[i] = Reg; @@ -5941,7 +6074,7 @@ void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { - unsigned DstReg = MRI.createVirtualRegister(RI.getBoolRC()); + Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); MachineInstr *SIIF = BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) .add(Branch->getOperand(0)) @@ -5968,8 +6101,8 @@ void SIInstrInfo::convertNonUniformLoopRegion( if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { - unsigned DstReg = MRI.createVirtualRegister(RI.getBoolRC()); - unsigned BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC()); + Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); + Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC()); MachineInstrBuilder HeaderPHIBuilder = BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(), @@ -5979,7 +6112,7 @@ void SIInstrInfo::convertNonUniformLoopRegion( HeaderPHIBuilder.addReg(BackEdgeReg); } else { MachineBasicBlock *PMBB = *PI; - unsigned ZeroReg = MRI.createVirtualRegister(RI.getBoolRC()); + Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC()); materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), ZeroReg, 0); HeaderPHIBuilder.addReg(ZeroReg); @@ -6063,13 +6196,30 @@ SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - unsigned UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC()); + Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC()); MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC()); return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg) .addReg(UnusedCarry, RegState::Define | RegState::Dead); } +MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + Register DestReg, + RegScavenger &RS) const { + if (ST.hasAddNoCarry()) + return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg); + + Register UnusedCarry = RS.scavengeRegister(RI.getBoolRC(), I, 0, false); + // TODO: Users need to deal with this. + if (!UnusedCarry.isValid()) + return MachineInstrBuilder(); + + return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg) + .addReg(UnusedCarry, RegState::Define | RegState::Dead); +} + bool SIInstrInfo::isKillTerminator(unsigned Opcode) { switch (Opcode) { case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: @@ -6115,7 +6265,21 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { return false; const auto RCID = MI.getDesc().OpInfo[Idx].RegClass; - return RCID == AMDGPU::SReg_128RegClassID; + return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass); +} + +unsigned SIInstrInfo::getNumFlatOffsetBits(unsigned AddrSpace, + bool Signed) const { + if (!ST.hasFlatInstOffsets()) + return 0; + + if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS) + return 0; + + if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) + return Signed ? 12 : 11; + + return Signed ? 
13 : 12;
}

bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
@@ -6254,7 +6418,7 @@ static bool followSubRegDef(MachineInstr &MI,
MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
                                     MachineRegisterInfo &MRI) {
  assert(MRI.isSSA());
-  if (!TargetRegisterInfo::isVirtualRegister(P.Reg))
+  if (!Register::isVirtualRegister(P.Reg))
    return nullptr;

  auto RSR = P;
@@ -6265,8 +6429,7 @@ MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
    case AMDGPU::COPY:
    case AMDGPU::V_MOV_B32_e32: {
      auto &Op1 = MI->getOperand(1);
-      if (Op1.isReg() &&
-          TargetRegisterInfo::isVirtualRegister(Op1.getReg())) {
+      if (Op1.isReg() && Register::isVirtualRegister(Op1.getReg())) {
        if (Op1.isUndef())
          return nullptr;
        RSR = getRegSubRegPair(Op1);
@@ -6360,3 +6523,40 @@ bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
    return true;
  }
}
+
+MachineInstr *SIInstrInfo::createPHIDestinationCopy(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
+    const DebugLoc &DL, Register Src, Register Dst) const {
+  auto Cur = MBB.begin();
+  if (Cur != MBB.end())
+    do {
+      if (!Cur->isPHI() && Cur->readsRegister(Dst))
+        return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
+      ++Cur;
+    } while (Cur != MBB.end() && Cur != LastPHIIt);
+
+  return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
+                                                   Dst);
+}
+
+MachineInstr *SIInstrInfo::createPHISourceCopy(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
+    const DebugLoc &DL, Register Src, Register SrcSubReg, Register Dst) const {
+  if (InsPt != MBB.end() &&
+      (InsPt->getOpcode() == AMDGPU::SI_IF ||
+       InsPt->getOpcode() == AMDGPU::SI_ELSE ||
+       InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
+      InsPt->definesRegister(Src)) {
+    InsPt++;
+    return BuildMI(MBB, InsPt, InsPt->getDebugLoc(),
+                   get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
+                                     : AMDGPU::S_MOV_B64_term),
+                   Dst)
+        .addReg(Src, 0, SrcSubReg)
+        .addReg(AMDGPU::EXEC, RegState::Implicit);
+  }
+  return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
+                                              Dst);
+}
+
+bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 3ff35da0b963..be463442c888 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -173,7 +173,7 @@ public:
  }

  bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
-                                         AliasAnalysis *AA) const override;
+                                         AAResults *AA) const override;

  bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
                               int64_t &Offset1,
@@ -229,6 +229,14 @@ public:

  bool expandPostRAPseudo(MachineInstr &MI) const override;

+  // Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp
+  // instructions. Returns a pair of generated instructions.
+  // Can split either post-RA with physical registers or pre-RA with
+  // virtual registers. In the latter case IR needs to be in SSA form
+  // and a REG_SEQUENCE is produced to define the original register.
+  std::pair<MachineInstr*, MachineInstr*>
+  expandMovDPP64(MachineInstr &MI) const;
+
  // Returns an opcode that can be used to move a value to a \p DstRC
  // register. If there is no hardware instruction that can store to \p
  // DstRC, then AMDGPU::COPY is returned.
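The getNumFlatOffsetBits() hook added a few hunks above reduces to a four-entry table. A self-contained restatement of it, with the subtarget queries lifted into plain parameters (a sketch under those assumptions, not the committed code):

    // Immediate-offset width available to FLAT-family instructions.
    unsigned numFlatOffsetBits(bool HasFlatInstOffsets, bool HasSegmentOffsetBug,
                               bool IsFlatAddrSpace, bool IsGFX10Plus,
                               bool Signed) {
      if (!HasFlatInstOffsets)
        return 0; // older targets have no immediate offset field at all
      if (HasSegmentOffsetBug && IsFlatAddrSpace)
        return 0; // hardware bug: no offsets on true FLAT address space accesses
      if (IsGFX10Plus)
        return Signed ? 12 : 11; // GFX10 offset field is one bit narrower
      return Signed ? 13 : 12;   // GFX9, the other target with flat offsets
    }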
@@ -242,7 +250,7 @@ public: return commuteOpcode(MI.getOpcode()); } - bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, + bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; bool findCommutedOpIndices(MCInstrDesc Desc, unsigned & SrcOpIdx0, @@ -303,8 +311,7 @@ public: bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, - const MachineInstr &MIb, - AliasAnalysis *AA = nullptr) const override; + const MachineInstr &MIb) const override; bool isFoldableCopy(const MachineInstr &MI) const; @@ -578,6 +585,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::IsMAI; } + static bool isDOT(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::IsDOT; + } + + bool isDOT(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::IsDOT; + } + static bool isScalarUnit(const MachineInstr &MI) { return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD); } @@ -954,6 +969,19 @@ public: bool isBasicBlockPrologue(const MachineInstr &MI) const override; + MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsPt, + const DebugLoc &DL, Register Src, + Register Dst) const override; + + MachineInstr *createPHISourceCopy(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsPt, + const DebugLoc &DL, Register Src, + Register SrcSubReg, + Register Dst) const override; + + bool isWave32() const; + /// Return a partially built integer add instruction without carry. /// Caller must add source operands. /// For pre-GFX9 it will generate unused carry destination operand. @@ -963,6 +991,12 @@ public: const DebugLoc &DL, unsigned DestReg) const; + MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + Register DestReg, + RegScavenger &RS) const; + static bool isKillTerminator(unsigned Opcode); const MCInstrDesc &getKillTerminatorFromPseudo(unsigned Opcode) const; @@ -970,6 +1004,8 @@ public: return isUInt<12>(Imm); } + unsigned getNumFlatOffsetBits(unsigned AddrSpace, bool Signed) const; + /// Returns if \p Offset is legal for the subtarget as the offset to a FLAT /// encoded instruction. If \p Signed, this is for an instruction that /// interprets the offset as signed. 
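The SIInstrInfo.td diff below begins by retitling the buffer node comments from cachecontrol(imm) to "cachepolicy, swizzled buffer(imm)", matching the swz operand that legalizeOperands() now appends to merged MUBUF instructions above. A small decoding sketch of that shared immediate; the bit positions follow the amdgcn buffer-intrinsic documentation of this period and are an assumption of this sketch, not spelled out in the diff:

    #include <cstdint>

    // One cachepolicy immediate carries all of the buffer cache/layout flags.
    struct BufferCachePolicy {
      bool GLC, SLC, DLC, SWZ;
      static BufferCachePolicy decode(uint32_t Imm) {
        return {(Imm & 1) != 0,  // bit 0: glc (globally coherent)
                (Imm & 2) != 0,  // bit 1: slc (system level coherent)
                (Imm & 4) != 0,  // bit 2: dlc (device level coherent, gfx10+)
                (Imm & 8) != 0}; // bit 3: swz (swizzled buffer access)
      }
    };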
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index c382c816e0b4..1eecbf555613 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -84,7 +84,7 @@ def SDTtbuffer_load : SDTypeProfile<1, 8, SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) SDTCisVT<6, i32>, // format(imm) - SDTCisVT<7, i32>, // cachecontrol(imm) + SDTCisVT<7, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<8, i1> // idxen(imm) ]>; @@ -102,7 +102,7 @@ def SDTtbuffer_store : SDTypeProfile<0, 9, SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) SDTCisVT<6, i32>, // format(imm) - SDTCisVT<7, i32>, // cachecontrol(imm) + SDTCisVT<7, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<8, i1> // idxen(imm) ]>; @@ -119,7 +119,7 @@ def SDTBufferLoad : SDTypeProfile<1, 7, SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // cachepolicy(imm) + SDTCisVT<6, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<7, i1>]>; // idxen(imm) def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad, @@ -145,7 +145,7 @@ def SDTBufferStore : SDTypeProfile<0, 8, SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // cachepolicy(imm) + SDTCisVT<6, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<7, i1>]>; // idxen(imm) def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore, @@ -198,6 +198,8 @@ def SIbuffer_atomic_umax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMAX">; def SIbuffer_atomic_and : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_AND">; def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">; def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">; +def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">; +def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">; def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD", f32>; def SIbuffer_atomic_pk_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_PK_FADD", v2f16>; @@ -264,6 +266,11 @@ def SIload_d16_hi_i8 : SDNode<"AMDGPUISD::LOAD_D16_HI_I8", [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] >; +def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE", + SDTypeProfile<0 ,1, [SDTCisInt<0>]>, + [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue] +>; + //===----------------------------------------------------------------------===// // ValueType helpers //===----------------------------------------------------------------------===// @@ -277,7 +284,9 @@ class isFloatType<ValueType SrcVT> { !if(!eq(SrcVT.Value, f64.Value), 1, !if(!eq(SrcVT.Value, v2f16.Value), 1, !if(!eq(SrcVT.Value, v4f16.Value), 1, - 0))))); + !if(!eq(SrcVT.Value, v2f32.Value), 1, + !if(!eq(SrcVT.Value, v2f64.Value), 1, + 0))))))); } class isIntType<ValueType SrcVT> { @@ -300,14 +309,36 @@ class isPackedType<ValueType SrcVT> { // PatFrags for global memory operations //===----------------------------------------------------------------------===// -defm atomic_inc_global : global_binary_atomic_op<SIatomic_inc>; -defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>; +foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { +let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in { + -def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>; -def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>; -def 
atomic_load_fadd_local : local_binary_atomic_op<atomic_load_fadd>; -def atomic_load_fmin_local : local_binary_atomic_op<SIatomic_fmin>; -def atomic_load_fmax_local : local_binary_atomic_op<SIatomic_fmax>; +defm atomic_inc_#as : binary_atomic_op<SIatomic_inc>; +defm atomic_dec_#as : binary_atomic_op<SIatomic_dec>; +defm atomic_load_fmin_#as : binary_atomic_op<SIatomic_fmin, 0>; +defm atomic_load_fmax_#as : binary_atomic_op<SIatomic_fmax, 0>; + + +} // End let AddressSpaces = ... +} // End foreach AddrSpace + +def atomic_fadd_global_noret : PatFrag< + (ops node:$ptr, node:$value), + (SIglobal_atomic_fadd node:$ptr, node:$value)> { + // FIXME: Move this + let MemoryVT = f32; + let IsAtomic = 1; + let AddressSpaces = StoreAddress_global.AddrSpaces; +} + +def atomic_pk_fadd_global_noret : PatFrag< + (ops node:$ptr, node:$value), + (SIglobal_atomic_pk_fadd node:$ptr, node:$value)> { + // FIXME: Move this + let MemoryVT = v2f16; + let IsAtomic = 1; + let AddressSpaces = StoreAddress_global.AddrSpaces; +} //===----------------------------------------------------------------------===// // SDNodes PatFrags for loads/stores with a glue input. @@ -328,10 +359,12 @@ def AMDGPUatomic_ld_glue : SDNode <"ISD::ATOMIC_LOAD", SDTAtomicLoad, >; def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr)> { + let IsLoad = 1; let IsUnindexed = 1; } def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr)> { + let IsLoad = 1; let IsNonExtLoad = 1; } @@ -347,14 +380,15 @@ def atomic_load_64_glue : PatFrag<(ops node:$ptr), let MemoryVT = i64; } -def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr)> { +def extload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> { let IsLoad = 1; let IsAnyExtLoad = 1; } -def sextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr), [{ - return cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD; -}]>; +def sextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> { + let IsLoad = 1; + let IsSignExtLoad = 1; +} def zextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> { let IsLoad = 1; @@ -391,25 +425,50 @@ def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr)> { let MemoryVT = i16; } -def load_glue_align8 : Aligned8Bytes < - (ops node:$ptr), (load_glue node:$ptr) ->; -def load_glue_align16 : Aligned16Bytes < - (ops node:$ptr), (load_glue node:$ptr) ->; +let IsLoad = 1, AddressSpaces = LoadAddress_local.AddrSpaces in { +def load_local_m0 : PatFrag<(ops node:$ptr), (load_glue node:$ptr)> { + let IsNonExtLoad = 1; +} -def load_local_m0 : LoadFrag<load_glue>, LocalAddress; -def sextloadi8_local_m0 : LoadFrag<sextloadi8_glue>, LocalAddress; -def sextloadi16_local_m0 : LoadFrag<sextloadi16_glue>, LocalAddress; -def extloadi8_local_m0 : LoadFrag<extloadi8_glue>, LocalAddress; -def zextloadi8_local_m0 : LoadFrag<zextloadi8_glue>, LocalAddress; -def extloadi16_local_m0 : LoadFrag<extloadi16_glue>, LocalAddress; -def zextloadi16_local_m0 : LoadFrag<zextloadi16_glue>, LocalAddress; -def load_align8_local_m0 : LoadFrag <load_glue_align8>, LocalAddress; -def load_align16_local_m0 : LoadFrag <load_glue_align16>, LocalAddress; -def atomic_load_32_local_m0 : LoadFrag<atomic_load_32_glue>, LocalAddress; -def atomic_load_64_local_m0 : LoadFrag<atomic_load_64_glue>, LocalAddress; +let MemoryVT = i8 in { +def extloadi8_local_m0 : PatFrag<(ops node:$ptr), (extloadi8_glue node:$ptr)>; +def sextloadi8_local_m0 : PatFrag<(ops node:$ptr), (sextloadi8_glue node:$ptr)>; +def 
zextloadi8_local_m0 : PatFrag<(ops node:$ptr), (zextloadi8_glue node:$ptr)>; +} + +let MemoryVT = i16 in { +def extloadi16_local_m0 : PatFrag<(ops node:$ptr), (extloadi16_glue node:$ptr)>; +def sextloadi16_local_m0 : PatFrag<(ops node:$ptr), (sextloadi16_glue node:$ptr)>; +def zextloadi16_local_m0 : PatFrag<(ops node:$ptr), (zextloadi16_glue node:$ptr)>; +} + +def load_align8_local_m0 : PatFrag<(ops node:$ptr), + (load_local_m0 node:$ptr)> { + let IsLoad = 1; + let IsNonExtLoad = 1; + let MinAlignment = 8; +} +def load_align16_local_m0 : PatFrag<(ops node:$ptr), + (load_local_m0 node:$ptr)> { + let IsLoad = 1; + let IsNonExtLoad = 1; + let MinAlignment = 16; +} + +} // End IsLoad = 1 + +let IsAtomic = 1, AddressSpaces = LoadAddress_local.AddrSpaces in { +def atomic_load_32_local_m0 : PatFrag<(ops node:$ptr), + (atomic_load_32_glue node:$ptr)> { + let MemoryVT = i32; +} +def atomic_load_64_local_m0 : PatFrag<(ops node:$ptr), + (atomic_load_64_glue node:$ptr)> { + let MemoryVT = i64; +} + +} // End let AddressSpaces = LoadAddress_local.AddrSpaces def AMDGPUst_glue : SDNode <"ISD::STORE", SDTStore, @@ -420,50 +479,88 @@ def AMDGPUatomic_st_glue : SDNode <"ISD::ATOMIC_STORE", SDTAtomicStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue] >; -def atomic_store_glue : PatFrag<(ops node:$ptr, node:$val), - (AMDGPUatomic_st_glue node:$ptr, node:$val)> { -} - def unindexedstore_glue : PatFrag<(ops node:$val, node:$ptr), - (AMDGPUst_glue node:$val, node:$ptr), [{ - return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED; -}]>; + (AMDGPUst_glue node:$val, node:$ptr)> { + let IsStore = 1; + let IsUnindexed = 1; +} def store_glue : PatFrag<(ops node:$val, node:$ptr), - (unindexedstore_glue node:$val, node:$ptr), [{ - return !cast<StoreSDNode>(N)->isTruncatingStore(); -}]>; + (unindexedstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 0; +} def truncstore_glue : PatFrag<(ops node:$val, node:$ptr), - (unindexedstore_glue node:$val, node:$ptr), [{ - return cast<StoreSDNode>(N)->isTruncatingStore(); -}]>; + (unindexedstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 1; +} def truncstorei8_glue : PatFrag<(ops node:$val, node:$ptr), - (truncstore_glue node:$val, node:$ptr), [{ - return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i8; -}]>; + (truncstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let MemoryVT = i8; +} def truncstorei16_glue : PatFrag<(ops node:$val, node:$ptr), - (truncstore_glue node:$val, node:$ptr), [{ - return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i16; -}]>; + (truncstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let MemoryVT = i16; +} -def store_glue_align8 : Aligned8Bytes < - (ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr) ->; +let IsStore = 1, AddressSpaces = StoreAddress_local.AddrSpaces in { +def store_local_m0 : PatFrag<(ops node:$val, node:$ptr), + (store_glue node:$val, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 0; +} -def store_glue_align16 : Aligned16Bytes < - (ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr) ->; +def truncstorei8_local_m0 : PatFrag<(ops node:$val, node:$ptr), + (unindexedstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let MemoryVT = i8; +} + +def truncstorei16_local_m0 : PatFrag<(ops node:$val, node:$ptr), + (unindexedstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let MemoryVT = i16; +} +} + +def store_align16_local_m0 : PatFrag < + (ops node:$value, node:$ptr), + (store_local_m0 node:$value, node:$ptr)> { 
+ let IsStore = 1; + let IsTruncStore = 0; + let MinAlignment = 16; +} -def store_local_m0 : StoreFrag<store_glue>, LocalAddress; -def truncstorei8_local_m0 : StoreFrag<truncstorei8_glue>, LocalAddress; -def truncstorei16_local_m0 : StoreFrag<truncstorei16_glue>, LocalAddress; -def atomic_store_local_m0 : StoreFrag<AMDGPUatomic_st_glue>, LocalAddress; +def store_align8_local_m0 : PatFrag < + (ops node:$value, node:$ptr), + (store_local_m0 node:$value, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 0; + let MinAlignment = 8; +} + +let AddressSpaces = StoreAddress_local.AddrSpaces in { + +def atomic_store_local_32_m0 : PatFrag < + (ops node:$value, node:$ptr), + (AMDGPUatomic_st_glue node:$value, node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i32; +} +def atomic_store_local_64_m0 : PatFrag < + (ops node:$value, node:$ptr), + (AMDGPUatomic_st_glue node:$value, node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i64; +} +} // End let AddressSpaces = StoreAddress_local.AddrSpaces -def store_align8_local_m0 : StoreFrag<store_glue_align8>, LocalAddress; -def store_align16_local_m0 : StoreFrag<store_glue_align16>, LocalAddress; def si_setcc_uniform : PatFrag < (ops node:$lhs, node:$rhs, node:$cond), @@ -539,16 +636,27 @@ def lshl_rev : PatFrag < (shl $src0, $src1) >; +def add_ctpop : PatFrag < + (ops node:$src0, node:$src1), + (add (ctpop $src0), $src1) +>; + multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, - SDTypeProfile tc = SDTAtomic2> { + SDTypeProfile tc = SDTAtomic2, + bit IsInt = 1> { def _glue : SDNode < !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, tc, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] >; - def _local_m0 : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; - def _region_m0 : region_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; + let AddressSpaces = StoreAddress_local.AddrSpaces in { + defm _local_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; + } + + let AddressSpaces = StoreAddress_region.AddrSpaces in { + defm _region_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; + } } defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; @@ -563,17 +671,9 @@ defm atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">; defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; defm atomic_swap : SIAtomicM0Glue2 <"SWAP">; -defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32>; -defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32>; -defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32>; - -def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, - [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] ->; - -def atomic_cmp_swap_local_m0 : AtomicCmpSwapLocal<atomic_cmp_swap_glue>; -def atomic_cmp_swap_region_m0 : AtomicCmpSwapRegion<atomic_cmp_swap_glue>; - +defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32, 0>; +defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32, 0>; +defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32, 0>; def as_i1imm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); @@ -591,6 +691,10 @@ def as_i32imm: SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); }]>; +def as_i32timm: SDNodeXForm<timm, [{ + return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); +}]>; + def as_i64imm: SDNodeXForm<imm, [{ 
return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64); }]>; @@ -627,9 +731,13 @@ def SIMM16bit : ImmLeaf <i32, >; def UIMM16bit : ImmLeaf <i32, - [{return isUInt<16>(Imm); }] + [{return isUInt<16>(Imm);}] >; +def i64imm_32bit : ImmLeaf<i64, [{ + return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm); +}]>; + class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{ return isInlineImmediate(N); }]>; @@ -763,6 +871,18 @@ def ExpTgtMatchClass : AsmOperandClass { let RenderMethod = "printExpTgt"; } +def SWaitMatchClass : AsmOperandClass { + let Name = "SWaitCnt"; + let RenderMethod = "addImmOperands"; + let ParserMethod = "parseSWaitCntOps"; +} + +def VReg32OrOffClass : AsmOperandClass { + let Name = "VReg32OrOff"; + let ParserMethod = "parseVReg32OrOff"; +} + +let OperandType = "OPERAND_IMMEDIATE" in { def SendMsgImm : Operand<i32> { let PrintMethod = "printSendMsg"; let ParserMatchClass = SendMsgMatchClass; @@ -778,22 +898,11 @@ def EndpgmImm : Operand<i16> { let ParserMatchClass = EndpgmMatchClass; } -def SWaitMatchClass : AsmOperandClass { - let Name = "SWaitCnt"; - let RenderMethod = "addImmOperands"; - let ParserMethod = "parseSWaitCntOps"; -} - -def VReg32OrOffClass : AsmOperandClass { - let Name = "VReg32OrOff"; - let ParserMethod = "parseVReg32OrOff"; -} - def WAIT_FLAG : Operand <i32> { let ParserMatchClass = SWaitMatchClass; let PrintMethod = "printWaitFlag"; - let OperandType = "OPERAND_IMMEDIATE"; } +} // End OperandType = "OPERAND_IMMEDIATE" include "SIInstrFormats.td" include "VIInstrFormats.td" @@ -929,6 +1038,7 @@ def DLC : NamedOperandBit<"DLC", NamedMatchClass<"DLC">>; def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>; def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>; def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>; +def SWZ : NamedOperandBit<"SWZ", NamedMatchClass<"SWZ">>; def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>; def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>; def R128A16 : NamedOperandBit<"R128A16", NamedMatchClass<"R128A16">>; @@ -1317,18 +1427,6 @@ class getVALUDstForVT<ValueType VT> { VOPDstS64orS32)))); // else VT == i1 } -// Returns true if VT is floating point. -class getIsFP<ValueType VT> { - bit ret = !if(!eq(VT.Value, f16.Value), 1, - !if(!eq(VT.Value, v2f16.Value), 1, - !if(!eq(VT.Value, v4f16.Value), 1, - !if(!eq(VT.Value, f32.Value), 1, - !if(!eq(VT.Value, v2f32.Value), 1, - !if(!eq(VT.Value, f64.Value), 1, - !if(!eq(VT.Value, v2f64.Value), 1, - 0))))))); -} - // Returns the register class to use for the destination of VOP[12C] // instructions with SDWA extension class getSDWADstForVT<ValueType VT> { @@ -1340,7 +1438,7 @@ class getSDWADstForVT<ValueType VT> { // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. 
class getVOPSrc0ForVT<ValueType VT> { - bit isFP = getIsFP<VT>.ret; + bit isFP = isFloatType<VT>.ret; RegisterOperand ret = !if(isFP, @@ -1373,11 +1471,14 @@ class getVOPSrc0ForVT<ValueType VT> { // Returns the vreg register class to use for source operand given VT class getVregSrcForVT<ValueType VT> { RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128, - !if(!eq(VT.Size, 64), VReg_64, VGPR_32)); + !if(!eq(VT.Size, 96), VReg_96, + !if(!eq(VT.Size, 64), VReg_64, + !if(!eq(VT.Size, 48), VReg_64, + VGPR_32)))); } class getSDWASrcForVT <ValueType VT> { - bit isFP = getIsFP<VT>.ret; + bit isFP = isFloatType<VT>.ret; RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32); RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32); RegisterOperand ret = !if(isFP, retFlt, retInt); @@ -1386,7 +1487,7 @@ class getSDWASrcForVT <ValueType VT> { // Returns the register class to use for sources of VOP3 instructions for the // given VT. class getVOP3SrcForVT<ValueType VT> { - bit isFP = getIsFP<VT>.ret; + bit isFP = isFloatType<VT>.ret; RegisterOperand ret = !if(!eq(VT.Size, 128), VSrc_128, @@ -1433,7 +1534,7 @@ class isModifierType<ValueType SrcVT> { // Return type of input modifiers operand for specified input operand class getSrcMod <ValueType VT, bit EnableF32SrcMods> { - bit isFP = getIsFP<VT>.ret; + bit isFP = isFloatType<VT>.ret; bit isPacked = isPackedType<VT>.ret; Operand ret = !if(!eq(VT.Size, 64), !if(isFP, FP64InputMods, Int64InputMods), @@ -1452,7 +1553,7 @@ class getOpSelMod <ValueType VT> { // Return type of input modifiers operand specified input operand for DPP class getSrcModExt <ValueType VT> { - bit isFP = getIsFP<VT>.ret; + bit isFP = isFloatType<VT>.ret; Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); } @@ -2038,6 +2139,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0, field int NeedPatGen = PatGenMode.NoPattern; field bit IsMAI = 0; + field bit IsDOT = 0; field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods); field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods); diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 70f20bb69370..21984c6ad910 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -43,8 +43,8 @@ multiclass V_INTERP_P1_F32_m : VINTRP_m < (outs VINTRPDst:$vdst), (ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan), "v_interp_p1_f32$vdst, $vsrc, $attr$attrchan", - [(set f32:$vdst, (AMDGPUinterp_p1 f32:$vsrc, (i32 imm:$attrchan), - (i32 imm:$attr)))] + [(set f32:$vdst, (int_amdgcn_interp_p1 f32:$vsrc, + (i32 timm:$attrchan), (i32 timm:$attr), M0))] >; let OtherPredicates = [has32BankLDS] in { @@ -66,8 +66,8 @@ defm V_INTERP_P2_F32 : VINTRP_m < (outs VINTRPDst:$vdst), (ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan), "v_interp_p2_f32$vdst, $vsrc, $attr$attrchan", - [(set f32:$vdst, (AMDGPUinterp_p2 f32:$src0, f32:$vsrc, (i32 imm:$attrchan), - (i32 imm:$attr)))]>; + [(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc, + (i32 timm:$attrchan), (i32 timm:$attr), M0))]>; } // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst" @@ -76,8 +76,8 @@ defm V_INTERP_MOV_F32 : VINTRP_m < (outs VINTRPDst:$vdst), (ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan), "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan", - [(set f32:$vdst, (AMDGPUinterp_mov (i32 imm:$vsrc), (i32 imm:$attrchan), - (i32 imm:$attr)))]>; + [(set f32:$vdst, 
(int_amdgcn_interp_mov (i32 imm:$vsrc), + (i32 timm:$attrchan), (i32 timm:$attr), M0))]>; } // End Uses = [M0, EXEC] @@ -92,6 +92,11 @@ def ATOMIC_FENCE : SPseudoInstSI< let maybeAtomic = 1; } +def VOP_I64_I64_DPP : VOPProfile <[i64, i64, untyped, untyped]> { + let HasExt = 1; + let HasExtDPP = 1; +} + let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { // For use in patterns @@ -107,10 +112,19 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_b64:$src0)>; +// 64-bit vector move with dpp. Expanded post-RA. +def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64_DPP> { + let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete. +} + // Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the // WQM pass processes it. def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; +// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is +// turned into a copy by WQM pass, but does not seed WQM requirements. +def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; + // Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so // that the @earlyclobber is respected. The @earlyclobber is to make sure that // the instruction that defines $src0 (which is run in WWM) doesn't @@ -345,13 +359,15 @@ def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> { } def SI_INIT_EXEC : SPseudoInstSI < - (outs), (ins i64imm:$src), []> { + (outs), (ins i64imm:$src), + [(int_amdgcn_init_exec (i64 timm:$src))]> { let Defs = [EXEC]; let usesCustomInserter = 1; let isAsCheapAsAMove = 1; let WaveSizePredicate = isWave64; } +// FIXME: Intrinsic should be mangled for wave size. def SI_INIT_EXEC_LO : SPseudoInstSI < (outs), (ins i32imm:$src), []> { let Defs = [EXEC_LO]; @@ -360,12 +376,20 @@ def SI_INIT_EXEC_LO : SPseudoInstSI < let WaveSizePredicate = isWave32; } +// FIXME: Wave32 version def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI < - (outs), (ins SSrc_b32:$input, i32imm:$shift), []> { + (outs), (ins SSrc_b32:$input, i32imm:$shift), + [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> { let Defs = [EXEC]; let usesCustomInserter = 1; } +def : GCNPat < + (int_amdgcn_init_exec timm:$src), + (SI_INIT_EXEC_LO (as_i32imm imm:$src))> { + let WaveSizePredicate = isWave32; +} + // Return for returning shaders to a shader variant epilog. 
def SI_RETURN_TO_EPILOG : SPseudoInstSI < (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> { @@ -604,25 +628,6 @@ def : GCNPat < (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0)) >; -def : GCNPat < - (AMDGPUinit_exec i64:$src), - (SI_INIT_EXEC (as_i64imm $src)) -> { - let WaveSizePredicate = isWave64; -} - -def : GCNPat < - (AMDGPUinit_exec i64:$src), - (SI_INIT_EXEC_LO (as_i32imm $src)) -> { - let WaveSizePredicate = isWave32; -} - -def : GCNPat < - (AMDGPUinit_exec_from_input i32:$input, i32:$shift), - (SI_INIT_EXEC_FROM_INPUT (i32 $input), (as_i32imm $shift)) ->; - def : GCNPat< (AMDGPUtrap timm:$trapid), (S_TRAP $trapid) @@ -740,22 +745,22 @@ def : GCNPat < def : GCNPat < (i32 (fp_to_sint f16:$src)), - (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 $src)) + (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 VSrc_b32:$src)) >; def : GCNPat < (i32 (fp_to_uint f16:$src)), - (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 $src)) + (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 VSrc_b32:$src)) >; def : GCNPat < (f16 (sint_to_fp i32:$src)), - (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 $src)) + (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 VSrc_b32:$src)) >; def : GCNPat < (f16 (uint_to_fp i32:$src)), - (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 $src)) + (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 VSrc_b32:$src)) >; //===----------------------------------------------------------------------===// @@ -808,8 +813,14 @@ def : GCNPat < (V_BCNT_U32_B32_e64 $popcnt, $val) >; } + def : GCNPat < - (i16 (add (i16 (trunc (getDivergentFrag<ctpop>.ret i32:$popcnt))), i16:$val)), + (i32 (ctpop i32:$popcnt)), + (V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0)) +>; + +def : GCNPat < + (i16 (add (i16 (trunc (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)))), i16:$val)), (V_BCNT_U32_B32_e64 $popcnt, $val) >; @@ -1076,53 +1087,158 @@ def : GCNPat < /********** ================================ **********/ // Prevent expanding both fneg and fabs. +// TODO: Add IgnoredBySelectionDAG bit? +let AddedComplexity = 1 in { // Prefer SALU to VALU patterns for DAG def : GCNPat < - (fneg (fabs f32:$src)), - (S_OR_B32 $src, (S_MOV_B32(i32 0x80000000))) // Set sign bit + (fneg (fabs (f32 SReg_32:$src))), + (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit >; -// FIXME: Should use S_OR_B32 def : GCNPat < - (fneg (fabs f64:$src)), - (REG_SEQUENCE VReg_64, - (i32 (EXTRACT_SUBREG f64:$src, sub0)), - sub0, - (V_OR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)), - (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit. 
- sub1) + (fabs (f32 SReg_32:$src)), + (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fffffff))) +>; + +def : GCNPat < + (fneg (f32 SReg_32:$src)), + (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) +>; + +def : GCNPat < + (fneg (f16 SReg_32:$src)), + (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) +>; + +def : GCNPat < + (fneg (f16 VGPR_32:$src)), + (V_XOR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) +>; + +def : GCNPat < + (fabs (f16 SReg_32:$src)), + (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff))) +>; + +def : GCNPat < + (fneg (fabs (f16 SReg_32:$src))), + (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit +>; + +def : GCNPat < + (fneg (fabs (f16 VGPR_32:$src))), + (V_OR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit +>; + +def : GCNPat < + (fneg (v2f16 SReg_32:$src)), + (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) +>; + +def : GCNPat < + (fabs (v2f16 SReg_32:$src)), + (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff))) +>; + +// This is really (fneg (fabs v2f16:$src)) +// +// fabs is not reported as free because there is modifier for it in +// VOP3P instructions, so it is turned into the bit op. +def : GCNPat < + (fneg (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))), + (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit >; def : GCNPat < - (fabs f32:$src), - (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fffffff))) + (fneg (v2f16 (fabs SReg_32:$src))), + (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit +>; + +// FIXME: The implicit-def of scc from S_[X]OR_B32 is mishandled + // def : GCNPat < +// (fneg (f64 SReg_64:$src)), +// (REG_SEQUENCE SReg_64, +// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), +// sub0, +// (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), +// (i32 (S_MOV_B32 (i32 0x80000000)))), +// sub1) +// >; + +// def : GCNPat < +// (fneg (fabs (f64 SReg_64:$src))), +// (REG_SEQUENCE SReg_64, +// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), +// sub0, +// (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), +// (S_MOV_B32 (i32 0x80000000))), // Set sign bit. +// sub1) +// >; + +} // End let AddedComplexity = 1 + +def : GCNPat < + (fabs (f32 VGPR_32:$src)), + (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fffffff)), VGPR_32:$src) +>; + +def : GCNPat < + (fneg (f32 VGPR_32:$src)), + (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src) +>; + +def : GCNPat < + (fabs (f16 VGPR_32:$src)), + (V_AND_B32_e32 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src) >; def : GCNPat < - (fneg f32:$src), - (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000))) + (fneg (v2f16 VGPR_32:$src)), + (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) >; def : GCNPat < - (fabs f64:$src), + (fabs (v2f16 VGPR_32:$src)), + (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src) +>; + +def : GCNPat < + (fneg (v2f16 (fabs VGPR_32:$src))), + (V_OR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) // Set sign bit +>; + +def : GCNPat < + (fabs (f64 VReg_64:$src)), (REG_SEQUENCE VReg_64, - (i32 (EXTRACT_SUBREG f64:$src, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), sub0, - (V_AND_B32_e64 (i32 (EXTRACT_SUBREG f64:$src, sub1)), + (V_AND_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), (V_MOV_B32_e32 (i32 0x7fffffff))), // Set sign bit. 
sub1) >; +// TODO: Use SGPR for constant def : GCNPat < - (fneg f64:$src), + (fneg (f64 VReg_64:$src)), (REG_SEQUENCE VReg_64, - (i32 (EXTRACT_SUBREG f64:$src, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), sub0, - (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)), + (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), (i32 (V_MOV_B32_e32 (i32 0x80000000)))), sub1) >; +// TODO: Use SGPR for constant +def : GCNPat < + (fneg (fabs (f64 VReg_64:$src))), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), + sub0, + (V_OR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), + (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit. + sub1) +>; + def : GCNPat < (fcopysign f16:$src0, f16:$src1), (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1) @@ -1154,45 +1270,6 @@ def : GCNPat < (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1))) >; -def : GCNPat < - (fneg f16:$src), - (S_XOR_B32 $src, (S_MOV_B32 (i32 0x00008000))) ->; - -def : GCNPat < - (fabs f16:$src), - (S_AND_B32 $src, (S_MOV_B32 (i32 0x00007fff))) ->; - -def : GCNPat < - (fneg (fabs f16:$src)), - (S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit ->; - -def : GCNPat < - (fneg v2f16:$src), - (S_XOR_B32 $src, (S_MOV_B32 (i32 0x80008000))) ->; - -def : GCNPat < - (fabs v2f16:$src), - (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fff7fff))) ->; - -// This is really (fneg (fabs v2f16:$src)) -// -// fabs is not reported as free because there is modifier for it in -// VOP3P instructions, so it is turned into the bit op. -def : GCNPat < - (fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))), - (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit ->; - -def : GCNPat < - (fneg (v2f16 (fabs v2f16:$src))), - (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit ->; - /********** ================== **********/ /********** Immediate Patterns **********/ /********** ================== **********/ @@ -1544,7 +1621,7 @@ def : GCNPat < (V_CVT_F16_F32_e32 ( V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), - $src)) + SSrc_i1:$src)) >; def : GCNPat < @@ -1552,35 +1629,35 @@ def : GCNPat < (V_CVT_F16_F32_e32 ( V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), - $src)) + SSrc_i1:$src)) >; def : GCNPat < (f32 (sint_to_fp i1:$src)), (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), - $src) + SSrc_i1:$src) >; def : GCNPat < (f32 (uint_to_fp i1:$src)), (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), - $src) + SSrc_i1:$src) >; def : GCNPat < (f64 (sint_to_fp i1:$src)), (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 -1), - $src)) + SSrc_i1:$src)) >; def : GCNPat < (f64 (uint_to_fp i1:$src)), (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 1), - $src)) + SSrc_i1:$src)) >; //===----------------------------------------------------------------------===// @@ -1788,6 +1865,22 @@ def : GCNPat < (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0) >; +def : GCNPat < + (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask, + timm:$bound_ctrl)), + (V_MOV_B64_DPP_PSEUDO $src, $src, (as_i32imm $dpp_ctrl), + (as_i32imm $row_mask), (as_i32imm $bank_mask), + (as_i1imm $bound_ctrl)) +>; + +def : GCNPat < + (i64 (int_amdgcn_update_dpp i64:$old, 
i64:$src, timm:$dpp_ctrl, timm:$row_mask, + timm:$bank_mask, timm:$bound_ctrl)), + (V_MOV_B64_DPP_PSEUDO $old, $src, (as_i32imm $dpp_ctrl), + (as_i32imm $row_mask), (as_i32imm $bank_mask), + (as_i1imm $bound_ctrl)) +>; + //===----------------------------------------------------------------------===// // Fract Patterns //===----------------------------------------------------------------------===// @@ -1915,3 +2008,13 @@ def : FP16Med3Pat<f16, V_MED3_F16>; defm : Int16Med3Pat<V_MED3_I16, smin, smax, smax_oneuse, smin_oneuse>; defm : Int16Med3Pat<V_MED3_U16, umin, umax, umax_oneuse, umin_oneuse>; } // End Predicates = [isGFX9Plus] + +class AMDGPUGenericInstruction : GenericInstruction { + let Namespace = "AMDGPU"; +} + +def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src); + let hasSideEffects = 0; +} diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index ae8b967893a2..20db1c37f354 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -42,10 +42,7 @@ // // Future improvements: // -// - This currently relies on the scheduler to place loads and stores next to -// each other, and then only merges adjacent pairs of instructions. It would -// be good to be more flexible with interleaved instructions, and possibly run -// before scheduling. It currently missing stores of constants because loading +// - This is currently missing stores of constants because loading // the constant into the data register is placed between the stores, although // this is arguably a scheduling problem. // @@ -98,14 +95,9 @@ enum InstClassEnum { DS_READ, DS_WRITE, S_BUFFER_LOAD_IMM, - BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN, - BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET, - BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN, - BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET, - BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact, - BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact, - BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact, - BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact, + BUFFER_LOAD, + BUFFER_STORE, + MIMG, }; enum RegisterEnum { @@ -114,6 +106,7 @@ enum RegisterEnum { SOFFSET = 0x4, VADDR = 0x8, ADDR = 0x10, + SSAMP = 0x20, }; class SILoadStoreOptimizer : public MachineFunctionPass { @@ -126,6 +119,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass { unsigned Width0; unsigned Width1; unsigned BaseOff; + unsigned DMask0; + unsigned DMask1; InstClassEnum InstClass; bool GLC0; bool GLC1; @@ -135,6 +130,60 @@ class SILoadStoreOptimizer : public MachineFunctionPass { bool DLC1; bool UseST64; SmallVector<MachineInstr *, 8> InstsToMove; + int AddrIdx[5]; + const MachineOperand *AddrReg[5]; + unsigned NumAddresses; + + bool hasSameBaseAddress(const MachineInstr &MI) { + for (unsigned i = 0; i < NumAddresses; i++) { + const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]); + + if (AddrReg[i]->isImm() || AddrRegNext.isImm()) { + if (AddrReg[i]->isImm() != AddrRegNext.isImm() || + AddrReg[i]->getImm() != AddrRegNext.getImm()) { + return false; + } + continue; + } + + // Check same base pointer. Be careful of subregisters, which can occur + // with vectors of pointers. 
+      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
+          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
+    for (unsigned i = 0; i < NumAddresses; ++i) {
+      const MachineOperand *AddrOp = AddrReg[i];
+      // Immediates are always OK.
+      if (AddrOp->isImm())
+        continue;
+
+      // Don't try to merge addresses that aren't either immediates or registers.
+      // TODO: Should be possible to merge FrameIndexes and maybe some other
+      // non-register
+      if (!AddrOp->isReg())
+        return false;
+
+      // TODO: We should be able to merge physical reg addresses.
+      if (Register::isPhysicalRegister(AddrOp->getReg()))
+        return false;
+
+      // If an address has only one use then there will be no other
+      // instructions with the same address, so we can't merge this one.
+      if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
+        return false;
+    }
+    return true;
+  }
+
+  void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
+             const GCNSubtarget &STM);
+  void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII);
 };
 
 struct BaseRegisters {
@@ -160,14 +209,12 @@ private:
   AliasAnalysis *AA = nullptr;
   bool OptimizeAgain;
 
+  static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII);
   static bool offsetsCanBeCombined(CombineInfo &CI);
   static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
   static unsigned getNewOpcode(const CombineInfo &CI);
   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
-  unsigned getOpcodeWidth(const MachineInstr &MI);
-  InstClassEnum getInstClass(unsigned Opc);
-  unsigned getRegs(unsigned Opc);
 
   bool findMatchingInst(CombineInfo &CI);
 
@@ -178,22 +225,27 @@ private:
   unsigned write2Opcode(unsigned EltSize) const;
   unsigned write2ST64Opcode(unsigned EltSize) const;
   MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
+  MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI);
   MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
   MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
   MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
 
   void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
-                           int32_t NewOffset);
-  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
-  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
-  Optional<int32_t> extractConstOffset(const MachineOperand &Op);
-  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
+                           int32_t NewOffset) const;
+  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const;
+  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
+  Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
+  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
 
   /// Promotes constant offset to the immediate by adjusting the base. It
   /// tries to use a base from the nearby instructions that allows it to have
   /// a 13-bit constant offset which gets promoted to the immediate.
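  /// For example (illustrative numbers, assuming a signed 13-bit immediate
  /// range of [-4096, 4095]): offsets base+6144 and base+6148 are both out of
  /// range, but re-anchoring the base at base+4096 leaves offsets 2048 and
  /// 2052, which both fit.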
bool promoteConstantOffsetToImm(MachineInstr &CI, MemInfoMap &Visited, - SmallPtrSet<MachineInstr *, 4> &Promoted); + SmallPtrSet<MachineInstr *, 4> &Promoted) const; + void addInstToMergeableList(const CombineInfo &CI, + std::list<std::list<CombineInfo> > &MergeableInsts) const; + bool collectMergeableInsts(MachineBasicBlock &MBB, + std::list<std::list<CombineInfo> > &MergeableInsts) const; public: static char ID; @@ -202,7 +254,11 @@ public: initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); } - bool optimizeBlock(MachineBasicBlock &MBB); + void removeCombinedInst(std::list<CombineInfo> &MergeList, + const MachineInstr &MI); + bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, + bool &OptimizeListAgain); + bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); bool runOnMachineFunction(MachineFunction &MF) override; @@ -216,6 +272,264 @@ public: } }; +static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { + const unsigned Opc = MI.getOpcode(); + + if (TII.isMUBUF(Opc)) { + // FIXME: Handle d16 correctly + return AMDGPU::getMUBUFElements(Opc); + } + if (TII.isMIMG(MI)) { + uint64_t DMaskImm = + TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); + return countPopulation(DMaskImm); + } + + switch (Opc) { + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + return 1; + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + return 2; + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return 4; + default: + return 0; + } +} + +/// Maps instruction opcode to enum InstClassEnum. +static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { + switch (Opc) { + default: + if (TII.isMUBUF(Opc)) { + switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { + default: + return UNKNOWN; + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: + return BUFFER_LOAD; + case AMDGPU::BUFFER_STORE_DWORD_OFFEN: + case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: + case AMDGPU::BUFFER_STORE_DWORD_OFFSET: + case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: + return BUFFER_STORE; + } + } + if (TII.isMIMG(Opc)) { + // Ignore instructions encoded without vaddr. + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1) + return UNKNOWN; + // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. + if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || TII.isGather4(Opc)) + return UNKNOWN; + return MIMG; + } + return UNKNOWN; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return S_BUFFER_LOAD_IMM; + case AMDGPU::DS_READ_B32: + case AMDGPU::DS_READ_B32_gfx9: + case AMDGPU::DS_READ_B64: + case AMDGPU::DS_READ_B64_gfx9: + return DS_READ; + case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_WRITE_B32_gfx9: + case AMDGPU::DS_WRITE_B64: + case AMDGPU::DS_WRITE_B64_gfx9: + return DS_WRITE; + } +} + +/// Determines instruction subclass from opcode. Only instructions +/// of the same subclass can be merged together. 
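+/// For example, DS_READ_B32 and DS_READ_B64 share the DS_READ class but keep
+/// distinct subclasses (the opcode itself), so a b32 read is never paired with
+/// a b64 read; MUBUF and MIMG opcodes are keyed by their base opcode instead.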
+static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { + switch (Opc) { + default: + if (TII.isMUBUF(Opc)) + return AMDGPU::getMUBUFBaseOpcode(Opc); + if (TII.isMIMG(Opc)) { + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); + assert(Info); + return Info->BaseOpcode; + } + return -1; + case AMDGPU::DS_READ_B32: + case AMDGPU::DS_READ_B32_gfx9: + case AMDGPU::DS_READ_B64: + case AMDGPU::DS_READ_B64_gfx9: + case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_WRITE_B32_gfx9: + case AMDGPU::DS_WRITE_B64: + case AMDGPU::DS_WRITE_B64_gfx9: + return Opc; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; + } +} + +static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) { + if (TII.isMUBUF(Opc)) { + unsigned result = 0; + + if (AMDGPU::getMUBUFHasVAddr(Opc)) { + result |= VADDR; + } + + if (AMDGPU::getMUBUFHasSrsrc(Opc)) { + result |= SRSRC; + } + + if (AMDGPU::getMUBUFHasSoffset(Opc)) { + result |= SOFFSET; + } + + return result; + } + + if (TII.isMIMG(Opc)) { + unsigned result = VADDR | SRSRC; + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); + if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) + result |= SSAMP; + return result; + } + + switch (Opc) { + default: + return 0; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return SBASE; + case AMDGPU::DS_READ_B32: + case AMDGPU::DS_READ_B64: + case AMDGPU::DS_READ_B32_gfx9: + case AMDGPU::DS_READ_B64_gfx9: + case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_WRITE_B64: + case AMDGPU::DS_WRITE_B32_gfx9: + case AMDGPU::DS_WRITE_B64_gfx9: + return ADDR; + } +} + + +void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, + const SIInstrInfo &TII, + const GCNSubtarget &STM) { + I = MI; + unsigned Opc = MI->getOpcode(); + InstClass = getInstClass(Opc, TII); + + if (InstClass == UNKNOWN) + return; + + switch (InstClass) { + case DS_READ: + EltSize = + (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 + : 4; + break; + case DS_WRITE: + EltSize = + (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 
8
+                                                                          : 4;
+    break;
+  case S_BUFFER_LOAD_IMM:
+    EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4);
+    break;
+  default:
+    EltSize = 4;
+    break;
+  }
+
+  if (InstClass == MIMG) {
+    DMask0 = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
+  } else {
+    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
+    Offset0 = I->getOperand(OffsetIdx).getImm();
+  }
+
+  Width0 = getOpcodeWidth(*I, TII);
+
+  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
+    Offset0 &= 0xffff;
+  } else if (InstClass != MIMG) {
+    GLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm();
+    if (InstClass != S_BUFFER_LOAD_IMM) {
+      SLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm();
+    }
+    DLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
+  }
+
+  unsigned AddrOpName[5] = {0};
+  NumAddresses = 0;
+  const unsigned Regs = getRegs(I->getOpcode(), TII);
+
+  if (Regs & ADDR) {
+    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
+  }
+
+  if (Regs & SBASE) {
+    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
+  }
+
+  if (Regs & SRSRC) {
+    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
+  }
+
+  if (Regs & SOFFSET) {
+    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
+  }
+
+  if (Regs & VADDR) {
+    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
+  }
+
+  if (Regs & SSAMP) {
+    AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp;
+  }
+
+  for (unsigned i = 0; i < NumAddresses; i++) {
+    AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
+    AddrReg[i] = &I->getOperand(AddrIdx[i]);
+  }
+
+  InstsToMove.clear();
+}
+
+void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI,
+                                                  const SIInstrInfo &TII) {
+  Paired = MI;
+  assert(InstClass == getInstClass(Paired->getOpcode(), TII));
+
+  if (InstClass == MIMG) {
+    DMask1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dmask)->getImm();
+  } else {
+    int OffsetIdx =
+        AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset);
+    Offset1 = Paired->getOperand(OffsetIdx).getImm();
+  }
+
+  Width1 = getOpcodeWidth(*Paired, TII);
+  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
+    Offset1 &= 0xffff;
+  } else if (InstClass != MIMG) {
+    GLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::glc)->getImm();
+    if (InstClass != S_BUFFER_LOAD_IMM) {
+      SLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::slc)->getImm();
+    }
+    DLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dlc)->getImm();
+  }
+}
+
 } // end anonymous namespace.
 
 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
@@ -249,8 +563,7 @@ static void addDefsUsesToList(const MachineInstr &MI,
     if (Op.isReg()) {
       if (Op.isDef())
         RegDefs.insert(Op.getReg());
-      else if (Op.readsReg() &&
-               TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
+      else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
         PhysRegUses.insert(Op.getReg());
     }
   }
@@ -282,7 +595,7 @@ static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
     if (Use.isReg() &&
         ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
          (Use.isDef() && RegDefs.count(Use.getReg())) ||
-         (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
+         (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
       Insts.push_back(&MI);
       addDefsUsesToList(MI, RegDefs, PhysRegUses);
@@ -307,7 +620,59 @@ static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
   return true;
 }
 
+// This function assumes that \p A and \p B are identical except for
+// size and offset, and that they reference adjacent memory.
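+// For example (illustrative values): a 4-byte operand at offset 8 combined
+// with a 4-byte operand at offset 4 yields a single 8-byte operand at
+// offset 4, i.e. the lower of the two offsets, covering both sizes.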
+static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, + const MachineMemOperand *A, + const MachineMemOperand *B) { + unsigned MinOffset = std::min(A->getOffset(), B->getOffset()); + unsigned Size = A->getSize() + B->getSize(); + // This function adds the offset parameter to the existing offset for A, + // so we pass 0 here as the offset and then manually set it to the correct + // value after the call. + MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); + MMO->setOffset(MinOffset); + return MMO; +} + +bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII) { + assert(CI.InstClass == MIMG); + + // Ignore instructions with tfe/lwe set. + const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); + const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); + + if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) + return false; + + // Check other optional immediate operands for equality. + unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc, + AMDGPU::OpName::d16, AMDGPU::OpName::unorm, + AMDGPU::OpName::da, AMDGPU::OpName::r128}; + + for (auto op : OperandsToMatch) { + int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); + if (AMDGPU::getNamedOperandIdx(CI.Paired->getOpcode(), op) != Idx) + return false; + if (Idx != -1 && + CI.I->getOperand(Idx).getImm() != CI.Paired->getOperand(Idx).getImm()) + return false; + } + + // Check DMask for overlaps. + unsigned MaxMask = std::max(CI.DMask0, CI.DMask1); + unsigned MinMask = std::min(CI.DMask0, CI.DMask1); + + unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask); + if ((1u << AllowedBitsForMin) <= MinMask) + return false; + + return true; +} + bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { + assert(CI.InstClass != MIMG); + // XXX - Would the same offset be OK? Is there any reason this would happen or // be useful? if (CI.Offset0 == CI.Offset1) @@ -384,164 +749,24 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, } } -unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) { - const unsigned Opc = MI.getOpcode(); - - if (TII->isMUBUF(MI)) { - return AMDGPU::getMUBUFDwords(Opc); - } - - switch (Opc) { - default: - return 0; - case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: - return 1; - case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: - return 2; - case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: - return 4; - } -} - -InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) { - if (TII->isMUBUF(Opc)) { - const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc); - - // If we couldn't identify the opcode, bail out. 
- if (baseOpcode == -1) { - return UNKNOWN; - } - - switch (baseOpcode) { - default: - return UNKNOWN; - case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: - return BUFFER_LOAD_OFFEN; - case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: - return BUFFER_LOAD_OFFSET; - case AMDGPU::BUFFER_STORE_DWORD_OFFEN: - return BUFFER_STORE_OFFEN; - case AMDGPU::BUFFER_STORE_DWORD_OFFSET: - return BUFFER_STORE_OFFSET; - case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: - return BUFFER_LOAD_OFFEN_exact; - case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: - return BUFFER_LOAD_OFFSET_exact; - case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: - return BUFFER_STORE_OFFEN_exact; - case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: - return BUFFER_STORE_OFFSET_exact; - } - } - - switch (Opc) { - default: - return UNKNOWN; - case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: - case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: - case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: - return S_BUFFER_LOAD_IMM; - case AMDGPU::DS_READ_B32: - case AMDGPU::DS_READ_B64: - case AMDGPU::DS_READ_B32_gfx9: - case AMDGPU::DS_READ_B64_gfx9: - return DS_READ; - case AMDGPU::DS_WRITE_B32: - case AMDGPU::DS_WRITE_B64: - case AMDGPU::DS_WRITE_B32_gfx9: - case AMDGPU::DS_WRITE_B64_gfx9: - return DS_WRITE; - } -} - -unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) { - if (TII->isMUBUF(Opc)) { - unsigned result = 0; - - if (AMDGPU::getMUBUFHasVAddr(Opc)) { - result |= VADDR; - } - - if (AMDGPU::getMUBUFHasSrsrc(Opc)) { - result |= SRSRC; - } - - if (AMDGPU::getMUBUFHasSoffset(Opc)) { - result |= SOFFSET; - } - - return result; - } - - switch (Opc) { - default: - return 0; - case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: - case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: - case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: - return SBASE; - case AMDGPU::DS_READ_B32: - case AMDGPU::DS_READ_B64: - case AMDGPU::DS_READ_B32_gfx9: - case AMDGPU::DS_READ_B64_gfx9: - case AMDGPU::DS_WRITE_B32: - case AMDGPU::DS_WRITE_B64: - case AMDGPU::DS_WRITE_B32_gfx9: - case AMDGPU::DS_WRITE_B64_gfx9: - return ADDR; - } -} - bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); MachineBasicBlock::iterator E = MBB->end(); MachineBasicBlock::iterator MBBI = CI.I; const unsigned Opc = CI.I->getOpcode(); - const InstClassEnum InstClass = getInstClass(Opc); + const InstClassEnum InstClass = getInstClass(Opc, *TII); if (InstClass == UNKNOWN) { return false; } + const unsigned InstSubclass = getInstSubclass(Opc, *TII); - const unsigned Regs = getRegs(Opc); - - unsigned AddrOpName[5] = {0}; - int AddrIdx[5]; - const MachineOperand *AddrReg[5]; - unsigned NumAddresses = 0; - - if (Regs & ADDR) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::addr; - } - - if (Regs & SBASE) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase; - } - - if (Regs & SRSRC) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; - } - - if (Regs & SOFFSET) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; - } - - if (Regs & VADDR) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; - } - - for (unsigned i = 0; i < NumAddresses; i++) { - AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]); - AddrReg[i] = &CI.I->getOperand(AddrIdx[i]); - - // We only ever merge operations with the same base address register, so - // don't bother scanning forward if there are no other uses. - if (AddrReg[i]->isReg() && - (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) || - MRI->hasOneNonDBGUse(AddrReg[i]->getReg()))) - return false; - } + // Do not merge VMEM buffer instructions with "swizzled" bit set. 
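+  // (When swz is set the hardware applies a swizzled addressing pattern, so
+  // accesses at nominally adjacent offsets need not touch adjacent memory,
+  // and pairing them could be unsound.)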
+  int Swizzled =
+      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
+  if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
+    return false;
 
   ++MBBI;
@@ -550,11 +775,10 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
   addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
 
   for (; MBBI != E; ++MBBI) {
-    const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);
-    if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
-        (IsDS && (MBBI->getOpcode() != Opc))) {
-      // This is not a matching DS instruction, but we can keep looking as
+    if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
+        (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
+      // This is not a matching instruction, but we can keep looking as
       // long as one of these conditions is met:
       // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
@@ -599,58 +823,23 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
                                CI.InstsToMove))
       continue;
 
-    bool Match = true;
-    for (unsigned i = 0; i < NumAddresses; i++) {
-      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
-
-      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
-        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
-            AddrReg[i]->getImm() != AddrRegNext.getImm()) {
-          Match = false;
-          break;
-        }
-        continue;
-      }
-
-      // Check same base pointer. Be careful of subregisters, which can occur
-      // with vectors of pointers.
-      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
-          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
-        Match = false;
-        break;
-      }
-    }
+    bool Match = CI.hasSameBaseAddress(*MBBI);
 
     if (Match) {
-      int OffsetIdx =
-          AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
-      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
-      CI.Width0 = getOpcodeWidth(*CI.I);
-      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
-      CI.Width1 = getOpcodeWidth(*MBBI);
-      CI.Paired = MBBI;
+      CI.setPaired(MBBI, *TII);
 
-      if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
-        CI.Offset0 &= 0xffff;
-        CI.Offset1 &= 0xffff;
-      } else {
-        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
-        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
-        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
-          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
-          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
-        }
-        CI.DLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::dlc)->getImm();
-        CI.DLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::dlc)->getImm();
-      }
+      // Check that both offsets (or masks for MIMG) can be combined and fit in
+      // the reduced range.
+      bool canBeCombined =
+          CI.InstClass == MIMG
+              ? dmasksCanBeCombined(CI, *TII)
+              : widthsFit(*STM, CI) && offsetsCanBeCombined(CI);
 
-      // Check both offsets fit in the reduced range.
       // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
-      if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
-        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
-          return true;
+      if (canBeCombined && canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
+        return true;
     }
 
     // We've found a load/store that we couldn't merge for some reason.
@@ -711,15 +900,15 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
   const TargetRegisterClass *SuperRC = (CI.EltSize == 4) ?
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; - unsigned DestReg = MRI->createVirtualRegister(SuperRC); + Register DestReg = MRI->createVirtualRegister(SuperRC); DebugLoc DL = CI.I->getDebugLoc(); - unsigned BaseReg = AddrReg->getReg(); + Register BaseReg = AddrReg->getReg(); unsigned BaseSubReg = AddrReg->getSubReg(); unsigned BaseRegFlags = 0; if (CI.BaseOff) { - unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) .addImm(CI.BaseOff); @@ -755,12 +944,11 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) { moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); - return Next; + return Read2; } unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { @@ -809,11 +997,11 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) { const MCInstrDesc &Write2Desc = TII->get(Opc); DebugLoc DL = CI.I->getDebugLoc(); - unsigned BaseReg = AddrReg->getReg(); + Register BaseReg = AddrReg->getReg(); unsigned BaseSubReg = AddrReg->getSubReg(); unsigned BaseRegFlags = 0; if (CI.BaseOff) { - unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) .addImm(CI.BaseOff); @@ -839,12 +1027,65 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) { moveInstsAfter(Write2, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); - return Next; + return Write2; +} + +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI) { + MachineBasicBlock *MBB = CI.I->getParent(); + DebugLoc DL = CI.I->getDebugLoc(); + const unsigned Opcode = getNewOpcode(CI); + + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); + + Register DestReg = MRI->createVirtualRegister(SuperRC); + unsigned MergedDMask = CI.DMask0 | CI.DMask1; + unsigned DMaskIdx = + AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); + + auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg); + for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { + if (I == DMaskIdx) + MIB.addImm(MergedDMask); + else + MIB.add((*CI.I).getOperand(I)); + } + + // It shouldn't be possible to get this far if the two instructions + // don't have a single memoperand, because MachineInstr::mayAlias() + // will return true if this is the case. + assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand()); + + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); + + MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + + std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); + + // Copy to the old destination registers. 
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); + const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); + const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata); + + BuildMI(*MBB, CI.Paired, DL, CopyDesc) + .add(*Dest0) // Copy to same destination including flags and sub reg. + .addReg(DestReg, 0, SubRegIdx0); + MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); + + moveInstsAfter(Copy1, CI.InstsToMove); + + CI.I->eraseFromParent(); + CI.Paired->eraseFromParent(); + return New; } MachineBasicBlock::iterator @@ -855,15 +1096,24 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); - unsigned DestReg = MRI->createVirtualRegister(SuperRC); + Register DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); - BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) - .addImm(MergedOffset) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.DLC0) // dlc - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + // It shouldn't be possible to get this far if the two instructions + // don't have a single memoperand, because MachineInstr::mayAlias() + // will return true if this is the case. + assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand()); + + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); + + MachineInstr *New = + BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) + .addImm(MergedOffset) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.DLC0) // dlc + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -883,10 +1133,9 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - return Next; + return New; } MachineBasicBlock::iterator @@ -899,24 +1148,34 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); // Copy to the new source register. - unsigned DestReg = MRI->createVirtualRegister(SuperRC); + Register DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg); - const unsigned Regs = getRegs(Opcode); + const unsigned Regs = getRegs(Opcode, *TII); if (Regs & VADDR) MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) - .addImm(MergedOffset) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.SLC0) // slc - .addImm(0) // tfe - .addImm(CI.DLC0) // dlc - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + // It shouldn't be possible to get this far if the two instructions + // don't have a single memoperand, because MachineInstr::mayAlias() + // will return true if this is the case. 
+  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
+
+  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
+  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
+
+  MachineInstr *New =
+    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+        .addImm(MergedOffset) // offset
+        .addImm(CI.GLC0)      // glc
+        .addImm(CI.SLC0)      // slc
+        .addImm(0)            // tfe
+        .addImm(CI.DLC0)      // dlc
+        .addImm(0)            // swz
+        .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
 
   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -936,10 +1195,9 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
 
   moveInstsAfter(Copy1, CI.InstsToMove);
 
-  MachineBasicBlock::iterator Next = std::next(CI.I);
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
-  return Next;
+  return New;
 }
 
 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
@@ -947,7 +1205,10 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
 
   switch (CI.InstClass) {
   default:
-    return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
+    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
+    // FIXME: Handle d16 correctly
+    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
+                                  Width);
   case UNKNOWN:
     llvm_unreachable("Unknown instruction class");
   case S_BUFFER_LOAD_IMM:
@@ -959,76 +1220,47 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
     case 4:
       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
     }
+  case MIMG:
+    assert((countPopulation(CI.DMask0 | CI.DMask1) == Width) && "No overlaps");
+    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
   }
 }
 
 std::pair<unsigned, unsigned>
 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
-  if (CI.Offset0 > CI.Offset1) {
-    switch (CI.Width0) {
-    default:
-      return std::make_pair(0, 0);
-    case 1:
-      switch (CI.Width1) {
-      default:
-        return std::make_pair(0, 0);
-      case 1:
-        return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
-      case 2:
-        return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
-      case 3:
-        return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
-      }
-    case 2:
-      switch (CI.Width1) {
-      default:
-        return std::make_pair(0, 0);
-      case 1:
-        return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
-      case 2:
-        return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
-      }
-    case 3:
-      switch (CI.Width1) {
-      default:
-        return std::make_pair(0, 0);
-      case 1:
-        return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
-      }
-    }
+
+  if (CI.Width0 == 0 || CI.Width1 == 0 || CI.Width0 + CI.Width1 > 4)
+    return std::make_pair(0, 0);
+
+  bool ReverseOrder;
+  if (CI.InstClass == MIMG) {
+    assert((countPopulation(CI.DMask0 | CI.DMask1) == CI.Width0 + CI.Width1) &&
+           "No overlaps");
+    ReverseOrder = CI.DMask0 > CI.DMask1;
+  } else
+    ReverseOrder = CI.Offset0 > CI.Offset1;
+
+  static const unsigned Idxs[4][4] = {
+      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
+      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
+      {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
+      {AMDGPU::sub3, 0, 0, 0},
+  };
+  unsigned Idx0;
+  unsigned Idx1;
+
+  assert(CI.Width0 >= 1 && CI.Width0 <= 3);
+  assert(CI.Width1 >= 1 && CI.Width1 <= 3);
+
+  if (ReverseOrder) {
+    Idx1 = Idxs[0][CI.Width1 - 1];
+    Idx0 = Idxs[CI.Width1][CI.Width0 - 1];
   } else {
-    switch (CI.Width0) {
-    default:
-      return std::make_pair(0, 0);
-    case 1:
-      switch 
(CI.Width1) { - default: - return std::make_pair(0, 0); - case 1: - return std::make_pair(AMDGPU::sub0, AMDGPU::sub1); - case 2: - return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2); - case 3: - return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3); - } - case 2: - switch (CI.Width1) { - default: - return std::make_pair(0, 0); - case 1: - return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2); - case 2: - return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3); - } - case 3: - switch (CI.Width1) { - default: - return std::make_pair(0, 0); - case 1: - return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3); - } - } + Idx0 = Idxs[0][CI.Width0 - 1]; + Idx1 = Idxs[CI.Width0][CI.Width1 - 1]; } + + return std::make_pair(Idx0, Idx1); } const TargetRegisterClass * @@ -1040,7 +1272,7 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) { case 2: return &AMDGPU::SReg_64_XEXECRegClass; case 4: - return &AMDGPU::SReg_128RegClass; + return &AMDGPU::SGPR_128RegClass; case 8: return &AMDGPU::SReg_256RegClass; case 16: @@ -1073,7 +1305,7 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { // Copy to the new source register. const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); - unsigned SrcReg = MRI->createVirtualRegister(SuperRC); + Register SrcReg = MRI->createVirtualRegister(SuperRC); const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata); @@ -1087,35 +1319,45 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); - const unsigned Regs = getRegs(Opcode); + const unsigned Regs = getRegs(Opcode, *TII); if (Regs & VADDR) MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) - .addImm(std::min(CI.Offset0, CI.Offset1)) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.SLC0) // slc - .addImm(0) // tfe - .addImm(CI.DLC0) // dlc - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + + // It shouldn't be possible to get this far if the two instructions + // don't have a single memoperand, because MachineInstr::mayAlias() + // will return true if this is the case. 
+ assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand()); + + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); + + MachineInstr *New = + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) + .addImm(std::min(CI.Offset0, CI.Offset1)) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.SLC0) // slc + .addImm(0) // tfe + .addImm(CI.DLC0) // dlc + .addImm(0) // swz + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); moveInstsAfter(MIB, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - return Next; + return New; } MachineOperand -SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) { +SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { APInt V(32, Val, true); if (TII->isInlineConstant(V)) return MachineOperand::CreateImm(Val); - unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); MachineInstr *Mov = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), Reg) @@ -1127,7 +1369,7 @@ SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) { // Compute base address using Addr and return the final register. unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, - const MemAddress &Addr) { + const MemAddress &Addr) const { MachineBasicBlock *MBB = MI.getParent(); MachineBasicBlock::iterator MBBI = MI.getIterator(); DebugLoc DL = MI.getDebugLoc(); @@ -1146,11 +1388,11 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned CarryReg = MRI->createVirtualRegister(CarryRC); - unsigned DeadCarryReg = MRI->createVirtualRegister(CarryRC); + Register CarryReg = MRI->createVirtualRegister(CarryRC); + Register DeadCarryReg = MRI->createVirtualRegister(CarryRC); - unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); MachineInstr *LoHalf = BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0) .addReg(CarryReg, RegState::Define) @@ -1170,7 +1412,7 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, (void)HiHalf; LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); - unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass); MachineInstr *FullBase = BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) @@ -1186,13 +1428,13 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, // Update base and offset with the NewBase and NewOffset in MI. 
 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, unsigned NewBase,
-                                               int32_t NewOffset) {
+                                               int32_t NewOffset) const {
   TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
 }
 
 Optional<int32_t>
-SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
+SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
   if (Op.isImm())
     return Op.getImm();
 
@@ -1218,7 +1460,7 @@ SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
 // %Base:vreg_64 =
 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
-                                                      MemAddress &Addr) {
+                                                      MemAddress &Addr) const {
   if (!Base.isReg())
     return;
 
@@ -1273,15 +1515,16 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
 
 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
     MachineInstr &MI, MemInfoMap &Visited,
-    SmallPtrSet<MachineInstr *, 4> &AnchorList) {
+    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
+
+  if (!(MI.mayLoad() ^ MI.mayStore()))
+    return false;
 
   // TODO: Support flat and scratch.
-  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
-      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
+  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
     return false;
 
-  // TODO: Support Store.
-  if (!MI.mayLoad())
+  if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
     return false;
 
   if (AnchorList.count(&MI))
@@ -1418,100 +1661,166 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
   return false;
 }
 
-// Scan through looking for adjacent LDS operations with constant offsets from
-// the same base register. We rely on the scheduler to do the hard work of
-// clustering nearby loads, and assume these are all adjacent.
-bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
-  bool Modified = false;
+void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
+                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
+  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
+    if (AddrList.front().hasSameBaseAddress(*CI.I) &&
+        AddrList.front().InstClass == CI.InstClass) {
+      AddrList.emplace_back(CI);
+      return;
+    }
+  }
+  // Base address not found, so add a new list.
+  MergeableInsts.emplace_back(1, CI);
+}
+
+bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
+                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
+  bool Modified = false;
   // Caches the base address and constant offset already computed for each
   // visited instruction.
   MemInfoMap Visited;
   // Contains the list of instructions for which constant offsets are being
   // promoted to the IMM.
   SmallPtrSet<MachineInstr *, 4> AnchorList;
 
-  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
-    MachineInstr &MI = *I;
-
+  // Sort potential mergeable instructions into lists. One list per base address.
+  for (MachineInstr &MI : MBB.instrs()) {
+    // We run this before checking if an address is mergeable, because it can
+    // produce better code even if the instructions aren't mergeable.
     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
       Modified = true;
 
+    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
+    if (InstClass == UNKNOWN)
+      continue;
+
     // Don't combine if volatile.
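    // (hasOrderedMemoryRef() below is also true for atomic and other ordered
    // accesses, so those are skipped here as well.)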
- if (MI.hasOrderedMemoryRef()) { - ++I; + if (MI.hasOrderedMemoryRef()) + continue; + + CombineInfo CI; + CI.setMI(MI, *TII, *STM); + + if (!CI.hasMergeableAddress(*MRI)) + continue; + + addInstToMergeableList(CI, MergeableInsts); + } + return Modified; +} + +// Scan through looking for adjacent LDS operations with constant offsets from +// the same base register. We rely on the scheduler to do the hard work of +// clustering nearby loads, and assume these are all adjacent. +bool SILoadStoreOptimizer::optimizeBlock( + std::list<std::list<CombineInfo> > &MergeableInsts) { + bool Modified = false; + + for (std::list<CombineInfo> &MergeList : MergeableInsts) { + if (MergeList.size() < 2) + continue; + + bool OptimizeListAgain = false; + if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { + // We weren't able to make any changes, so clear the list so we don't + // process the same instructions the next time we try to optimize this + // block. + MergeList.clear(); continue; } - const unsigned Opc = MI.getOpcode(); + // We made changes, but also determined that there were no more optimization + // opportunities, so we don't need to reprocess the list + if (!OptimizeListAgain) + MergeList.clear(); - CombineInfo CI; - CI.I = I; - CI.InstClass = getInstClass(Opc); + OptimizeAgain |= OptimizeListAgain; + Modified = true; + } + return Modified; +} + +void +SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList, + const MachineInstr &MI) { + + for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) { + if (&*CI->I == &MI) { + MergeList.erase(CI); + return; + } + } +} + +bool +SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( + std::list<CombineInfo> &MergeList, + bool &OptimizeListAgain) { + bool Modified = false; + for (auto I = MergeList.begin(); I != MergeList.end(); ++I) { + CombineInfo &CI = *I; switch (CI.InstClass) { default: break; case DS_READ: - CI.EltSize = - (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 - : 4; if (findMatchingInst(CI)) { Modified = true; - I = mergeRead2Pair(CI); - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI); + CI.setMI(NewMI, *TII, *STM); } - continue; + break; case DS_WRITE: - CI.EltSize = - (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 
8 - : 4; if (findMatchingInst(CI)) { Modified = true; - I = mergeWrite2Pair(CI); - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI); + CI.setMI(NewMI, *TII, *STM); } - continue; + break; case S_BUFFER_LOAD_IMM: - CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4); if (findMatchingInst(CI)) { Modified = true; - I = mergeSBufferLoadImmPair(CI); - OptimizeAgain |= (CI.Width0 + CI.Width1) < 16; - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16; } - continue; - case BUFFER_LOAD_OFFEN: - case BUFFER_LOAD_OFFSET: - case BUFFER_LOAD_OFFEN_exact: - case BUFFER_LOAD_OFFSET_exact: - CI.EltSize = 4; + break; + case BUFFER_LOAD: if (findMatchingInst(CI)) { Modified = true; - I = mergeBufferLoadPair(CI); - OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; } - continue; - case BUFFER_STORE_OFFEN: - case BUFFER_STORE_OFFSET: - case BUFFER_STORE_OFFEN_exact: - case BUFFER_STORE_OFFSET_exact: - CI.EltSize = 4; + break; + case BUFFER_STORE: if (findMatchingInst(CI)) { Modified = true; - I = mergeBufferStorePair(CI); - OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; } - continue; + break; + case MIMG: + if (findMatchingInst(CI)) { + Modified = true; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeImagePair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; + } + break; } - - ++I; + // Clear the InstsToMove after we have finished searching so we don't have + // stale values left over if we search for this CI again in another pass + // over the block. + CI.InstsToMove.clear(); } return Modified; @@ -1537,10 +1846,14 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { bool Modified = false; + for (MachineBasicBlock &MBB : MF) { + std::list<std::list<CombineInfo> > MergeableInsts; + // First pass: Collect list of all instructions we know how to merge. 
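+    // Second pass, in the loop below: repeatedly merge pairs within each
+    // per-base-address list until no merged result can pair up again.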
+ Modified |= collectMergeableInsts(MBB, MergeableInsts); do { OptimizeAgain = false; - Modified |= optimizeBlock(MBB); + Modified |= optimizeBlock(MergeableInsts); } while (OptimizeAgain); } diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp index 78f409cd9555..6f9abd3a8d9b 100644 --- a/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -98,6 +98,8 @@ private: void emitLoop(MachineInstr &MI); void emitEndCf(MachineInstr &MI); + Register getSaveExec(MachineInstr* MI); + void findMaskOperands(MachineInstr &MI, unsigned OpNo, SmallVectorImpl<MachineOperand> &Src) const; @@ -144,7 +146,7 @@ char &llvm::SILowerControlFlowID = SILowerControlFlow::ID; static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI, const SIInstrInfo *TII) { - unsigned SaveExecReg = MI.getOperand(0).getReg(); + Register SaveExecReg = MI.getOperand(0).getReg(); auto U = MRI->use_instr_nodbg_begin(SaveExecReg); if (U == MRI->use_instr_nodbg_end() || @@ -175,17 +177,31 @@ static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI, return true; } +Register SILowerControlFlow::getSaveExec(MachineInstr *MI) { + MachineBasicBlock *MBB = MI->getParent(); + MachineOperand &SaveExec = MI->getOperand(0); + assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister); + + Register SaveExecReg = SaveExec.getReg(); + unsigned FalseTermOpc = + TII->isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; + MachineBasicBlock::iterator I = (MI); + MachineBasicBlock::iterator J = std::next(I); + if (J != MBB->end() && J->getOpcode() == FalseTermOpc && + J->getOperand(1).isReg() && J->getOperand(1).getReg() == SaveExecReg) { + SaveExecReg = J->getOperand(0).getReg(); + J->eraseFromParent(); + } + return SaveExecReg; +} + void SILowerControlFlow::emitIf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator I(&MI); - - MachineOperand &SaveExec = MI.getOperand(0); - MachineOperand &Cond = MI.getOperand(1); - assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister && - Cond.getSubReg() == AMDGPU::NoSubRegister); - - Register SaveExecReg = SaveExec.getReg(); + Register SaveExecReg = getSaveExec(&MI); + MachineOperand& Cond = MI.getOperand(1); + assert(Cond.getSubReg() == AMDGPU::NoSubRegister); MachineOperand &ImpDefSCC = MI.getOperand(4); assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef()); @@ -204,7 +220,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { .addReg(Exec) .addReg(Exec, RegState::ImplicitDefine); - unsigned Tmp = MRI->createVirtualRegister(BoolRC); + Register Tmp = MRI->createVirtualRegister(BoolRC); MachineInstr *And = BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp) @@ -266,8 +282,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); - Register DstReg = MI.getOperand(0).getReg(); - assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister); + Register DstReg = getSaveExec(&MI); bool ExecModified = MI.getOperand(3).getImm() != 0; MachineBasicBlock::iterator Start = MBB.begin(); @@ -339,7 +354,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); - auto Dst = MI.getOperand(0).getReg(); + auto Dst = getSaveExec(&MI); // Skip ANDing with exec if the break condition is already masked by exec // because 
it is a V_CMP in the same basic block. (We know the break @@ -400,13 +415,17 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) { void SILowerControlFlow::emitEndCf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + unsigned CFMask = MI.getOperand(0).getReg(); + MachineInstr *Def = MRI.getUniqueVRegDef(CFMask); const DebugLoc &DL = MI.getDebugLoc(); - MachineBasicBlock::iterator InsPt = MBB.begin(); - MachineInstr *NewMI = - BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec) - .addReg(Exec) - .add(MI.getOperand(0)); + MachineBasicBlock::iterator InsPt = + Def && Def->getParent() == &MBB ? std::next(MachineBasicBlock::iterator(Def)) + : MBB.begin(); + MachineInstr *NewMI = BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec) + .addReg(Exec) + .add(MI.getOperand(0)); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *NewMI); @@ -422,7 +441,7 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) { void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo, SmallVectorImpl<MachineOperand> &Src) const { MachineOperand &Op = MI.getOperand(OpNo); - if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) { + if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg())) { Src.push_back(Op); return; } @@ -442,8 +461,7 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo, for (const auto &SrcOp : Def->explicit_operands()) if (SrcOp.isReg() && SrcOp.isUse() && - (TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) || - SrcOp.getReg() == Exec)) + (Register::isVirtualRegister(SrcOp.getReg()) || SrcOp.getReg() == Exec)) Src.push_back(SrcOp); } @@ -466,7 +484,7 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) { else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1; else return; - unsigned Reg = MI.getOperand(OpToReplace).getReg(); + Register Reg = MI.getOperand(OpToReplace).getReg(); MI.RemoveOperand(OpToReplace); MI.addOperand(Ops[UniqueOpndIdx]); if (MRI->use_empty(Reg)) diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp index 1c0f836f07e6..b45412536356 100644 --- a/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -96,7 +96,7 @@ private: getSaluInsertionAtEnd(MachineBasicBlock &MBB) const; bool isVreg1(unsigned Reg) const { - return TargetRegisterInfo::isVirtualRegister(Reg) && + return Register::isVirtualRegister(Reg) && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass; } @@ -489,6 +489,15 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) { return true; } +#ifndef NDEBUG +static bool isVRegCompatibleReg(const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI, + Register Reg) { + unsigned Size = TRI.getRegSizeInBits(Reg, MRI); + return Size == 1 || Size == 32; +} +#endif + void SILowerI1Copies::lowerCopiesFromI1() { SmallVector<MachineInstr *, 4> DeadCopies; @@ -497,8 +506,8 @@ void SILowerI1Copies::lowerCopiesFromI1() { if (MI.getOpcode() != AMDGPU::COPY) continue; - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); if (!isVreg1(SrcReg)) continue; @@ -509,7 +518,7 @@ void SILowerI1Copies::lowerCopiesFromI1() { LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI); DebugLoc DL = MI.getDebugLoc(); - assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32); + assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg)); 
assert(!MI.getOperand(0).getSubReg()); ConstrainRegs.insert(SrcReg); @@ -544,7 +553,7 @@ void SILowerI1Copies::lowerPhis() { LF.initialize(MBB); for (MachineInstr &MI : MBB.phis()) { - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); if (!isVreg1(DstReg)) continue; @@ -556,7 +565,7 @@ void SILowerI1Copies::lowerPhis() { // Collect incoming values. for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { assert(i + 1 < MI.getNumOperands()); - unsigned IncomingReg = MI.getOperand(i).getReg(); + Register IncomingReg = MI.getOperand(i).getReg(); MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB(); MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg); @@ -580,12 +589,12 @@ void SILowerI1Copies::lowerPhis() { // Phis in a loop that are observed outside the loop receive a simple but // conservatively correct treatment. - MachineBasicBlock *PostDomBound = &MBB; - for (MachineInstr &Use : MRI->use_instructions(DstReg)) { - PostDomBound = - PDT->findNearestCommonDominator(PostDomBound, Use.getParent()); - } + std::vector<MachineBasicBlock *> DomBlocks = {&MBB}; + for (MachineInstr &Use : MRI->use_instructions(DstReg)) + DomBlocks.push_back(Use.getParent()); + MachineBasicBlock *PostDomBound = + PDT->findNearestCommonDominator(DomBlocks); unsigned FoundLoopLevel = LF.findLoop(PostDomBound); SSAUpdater.Initialize(DstReg); @@ -669,7 +678,7 @@ void SILowerI1Copies::lowerCopiesToI1() { MI.getOpcode() != AMDGPU::COPY) continue; - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); if (!isVreg1(DstReg)) continue; @@ -686,10 +695,10 @@ void SILowerI1Copies::lowerCopiesToI1() { continue; DebugLoc DL = MI.getDebugLoc(); - unsigned SrcReg = MI.getOperand(1).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); assert(!MI.getOperand(1).getSubReg()); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || + if (!Register::isVirtualRegister(SrcReg) || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) { assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32); unsigned TmpReg = createLaneMaskReg(*MF); @@ -702,12 +711,12 @@ void SILowerI1Copies::lowerCopiesToI1() { // Defs in a loop that are observed outside the loop must be transformed // into appropriate bit manipulation. 
- MachineBasicBlock *PostDomBound = &MBB; - for (MachineInstr &Use : MRI->use_instructions(DstReg)) { - PostDomBound = - PDT->findNearestCommonDominator(PostDomBound, Use.getParent()); - } + std::vector<MachineBasicBlock *> DomBlocks = {&MBB}; + for (MachineInstr &Use : MRI->use_instructions(DstReg)) + DomBlocks.push_back(Use.getParent()); + MachineBasicBlock *PostDomBound = + PDT->findNearestCommonDominator(DomBlocks); unsigned FoundLoopLevel = LF.findLoop(PostDomBound); if (FoundLoopLevel) { SSAUpdater.Initialize(DstReg); @@ -734,7 +743,7 @@ bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const { break; Reg = MI->getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return false; if (!isLaneMaskReg(Reg)) return false; diff --git a/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index a82047473370..714d403a3e8f 100644 --- a/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -278,8 +278,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr); int FI = MI.getOperand(FIOp).getIndex(); - unsigned VReg = TII->getNamedOperand(MI, AMDGPU::OpName::vdata) - ->getReg(); + Register VReg = + TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, TRI->isAGPR(MRI, VReg))) { TRI->eliminateFrameIndex(MI, 0, FIOp, nullptr); diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 46da974a2f45..7dd0f11c95de 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -53,8 +53,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); WavesPerEU = ST.getWavesPerEU(F); - Occupancy = getMaxWavesPerEU(); - limitOccupancy(MF); + Occupancy = ST.computeOccupancy(MF, getLDSSize()); CallingConv::ID CC = F.getCallingConv(); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { @@ -190,7 +189,7 @@ unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( const SIRegisterInfo &TRI) { ArgInfo.PrivateSegmentBuffer = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass)); + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SGPR_128RegClass)); NumUserSGPRs += 4; return ArgInfo.PrivateSegmentBuffer.getRegister(); } @@ -487,6 +486,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()), MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()), + HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()), ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)), ScratchWaveOffsetReg(regToString(MFI.getScratchWaveOffsetReg(), TRI)), FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)), @@ -501,8 +501,9 @@ void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) { bool SIMachineFunctionInfo::initializeBaseYamlFields( const yaml::SIMachineFunctionInfo &YamlMFI) { ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize; - MaxKernArgAlign = YamlMFI.MaxKernArgAlign; + MaxKernArgAlign = assumeAligned(YamlMFI.MaxKernArgAlign); LDSSize = YamlMFI.LDSSize; + HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress; IsEntryFunction = YamlMFI.IsEntryFunction; NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath; MemoryBound = 
YamlMFI.MemoryBound; diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index f19b20ceb5da..7d70c786b594 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -265,6 +265,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { bool NoSignedZerosFPMath = false; bool MemoryBound = false; bool WaveLimiter = false; + uint32_t HighBitsOf32BitAddress = 0; StringValue ScratchRSrcReg = "$private_rsrc_reg"; StringValue ScratchWaveOffsetReg = "$scratch_wave_offset_reg"; @@ -302,6 +303,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { StringValue("$sp_reg")); YamlIO.mapOptional("argumentInfo", MFI.ArgInfo); YamlIO.mapOptional("mode", MFI.Mode, SIMode()); + YamlIO.mapOptional("highBitsOf32BitAddress", + MFI.HighBitsOf32BitAddress, 0u); } }; @@ -670,7 +673,7 @@ public: return GITPtrHigh; } - unsigned get32BitAddressHighBits() const { + uint32_t get32BitAddressHighBits() const { return HighBitsOf32BitAddress; } @@ -873,7 +876,7 @@ public: assert(BufferRsrc); auto PSV = BufferPSVs.try_emplace( BufferRsrc, - llvm::make_unique<AMDGPUBufferPseudoSourceValue>(TII)); + std::make_unique<AMDGPUBufferPseudoSourceValue>(TII)); return PSV.first->second.get(); } @@ -882,14 +885,14 @@ public: assert(ImgRsrc); auto PSV = ImagePSVs.try_emplace( ImgRsrc, - llvm::make_unique<AMDGPUImagePseudoSourceValue>(TII)); + std::make_unique<AMDGPUImagePseudoSourceValue>(TII)); return PSV.first->second.get(); } const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) { if (!GWSResourcePSV) { GWSResourcePSV = - llvm::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII); + std::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII); } return GWSResourcePSV.get(); diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp index ebbdf80f9567..c072ba6b2d1c 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -348,7 +348,7 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, // Do not Track Physical Registers, because it messes up. for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { - if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit)) + if (Register::isVirtualRegister(RegMaskPair.RegUnit)) LiveInRegs.insert(RegMaskPair.RegUnit); } LiveOutRegs.clear(); @@ -376,7 +376,7 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, // The use of findDefBetween removes the case 4. 
for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { unsigned Reg = RegMaskPair.RegUnit; - if (TargetRegisterInfo::isVirtualRegister(Reg) && + if (Register::isVirtualRegister(Reg) && isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(), LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI, LIS)) { @@ -1228,7 +1228,7 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria unsigned Color = CurrentColoring[SU->NodeNum]; if (RealID.find(Color) == RealID.end()) { int ID = CurrentBlocks.size(); - BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID)); + BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID)); CurrentBlocks.push_back(BlockPtrs.rbegin()->get()); RealID[Color] = ID; } @@ -1690,7 +1690,7 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) { for (unsigned Reg : Regs) { // For now only track virtual registers. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) continue; // If not already in the live set, then add it. (void) LiveRegs.insert(Reg); @@ -1750,7 +1750,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs, for (unsigned Reg : InRegs) { // For now only track virtual registers. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) continue; if (LiveRegsConsumers[Reg] > 1) continue; @@ -1762,7 +1762,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs, for (unsigned Reg : OutRegs) { // For now only track virtual registers. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) continue; PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); for (; PSetI.isValid(); ++PSetI) { @@ -1801,7 +1801,7 @@ SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, // SIScheduleDAGMI // SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) : - ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) { + ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) { SITII = static_cast<const SIInstrInfo*>(TII); SITRI = static_cast<const SIRegisterInfo*>(TRI); @@ -1913,7 +1913,7 @@ SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, for (_Iterator RegI = First; RegI != End; ++RegI) { unsigned Reg = *RegI; // For now only track virtual registers - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) continue; PSetIterator PSetI = MRI.getPressureSets(Reg); for (; PSetI.isValid(); ++PSetI) { diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 4320e6c957a0..e914573306ae 100644 --- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -656,10 +656,10 @@ SICacheControl::SICacheControl(const GCNSubtarget &ST) { std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { GCNSubtarget::Generation Generation = ST.getGeneration(); if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) - return make_unique<SIGfx6CacheControl>(ST); + return std::make_unique<SIGfx6CacheControl>(ST); if (Generation < AMDGPUSubtarget::GFX10) - return make_unique<SIGfx7CacheControl>(ST); - return make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled()); + return std::make_unique<SIGfx7CacheControl>(ST); + return std::make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled()); } bool 
SIGfx6CacheControl::enableLoadCacheBypass( diff --git a/lib/Target/AMDGPU/SIModeRegister.cpp b/lib/Target/AMDGPU/SIModeRegister.cpp index a5edd7b3554a..52989a280e80 100644 --- a/lib/Target/AMDGPU/SIModeRegister.cpp +++ b/lib/Target/AMDGPU/SIModeRegister.cpp @@ -226,7 +226,7 @@ void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI, // - on exit we have set the Require, Change, and initial Exit modes. void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB, const SIInstrInfo *TII) { - auto NewInfo = llvm::make_unique<BlockData>(); + auto NewInfo = std::make_unique<BlockData>(); MachineInstr *InsertionPoint = nullptr; // RequirePending is used to indicate whether we are collecting the initial // requirements for the block, and need to defer the first InsertionPoint to diff --git a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 3227bff20513..cc9b46a75582 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -322,7 +322,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { continue; } - unsigned CopyFromExec = CopyFromExecInst->getOperand(0).getReg(); + Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg(); MachineInstr *SaveExecInst = nullptr; SmallVector<MachineInstr *, 4> OtherUseInsts; diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index 7e10316eab92..fdd30db6a7cb 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -211,7 +211,7 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, return AMDGPU::NoRegister; MachineOperand *AndCC = &And->getOperand(1); - unsigned CmpReg = AndCC->getReg(); + Register CmpReg = AndCC->getReg(); unsigned CmpSubReg = AndCC->getSubReg(); if (CmpReg == ExecReg) { AndCC = &And->getOperand(2); @@ -234,7 +234,7 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1) return AMDGPU::NoRegister; - unsigned SelReg = Op1->getReg(); + Register SelReg = Op1->getReg(); auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, MRI, LIS); if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64) return AMDGPU::NoRegister; @@ -250,15 +250,16 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, Op1->getImm() != 0 || Op2->getImm() != 1) return AMDGPU::NoRegister; - LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' - << *Cmp << '\t' << *And); + LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t' + << *And); - unsigned CCReg = CC->getReg(); + Register CCReg = CC->getReg(); LIS->RemoveMachineInstrFromMaps(*And); - MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(), - TII->get(Andn2Opc), And->getOperand(0).getReg()) - .addReg(ExecReg) - .addReg(CCReg, 0, CC->getSubReg()); + MachineInstr *Andn2 = + BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc), + And->getOperand(0).getReg()) + .addReg(ExecReg) + .addReg(CCReg, getUndefRegState(CC->isUndef()), CC->getSubReg()); And->eraseFromParent(); LIS->InsertMachineInstrInMaps(*Andn2); @@ -266,20 +267,19 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, // Try to remove compare. Cmp value should not used in between of cmp // and s_and_b64 if VCC or just unused if any other register. 
- if ((TargetRegisterInfo::isVirtualRegister(CmpReg) && - MRI.use_nodbg_empty(CmpReg)) || + if ((Register::isVirtualRegister(CmpReg) && MRI.use_nodbg_empty(CmpReg)) || (CmpReg == CondReg && std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(), [&](const MachineInstr &MI) { - return MI.readsRegister(CondReg, TRI); }))) { + return MI.readsRegister(CondReg, TRI); + }))) { LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n'); LIS->RemoveMachineInstrFromMaps(*Cmp); Cmp->eraseFromParent(); // Try to remove v_cndmask_b32. - if (TargetRegisterInfo::isVirtualRegister(SelReg) && - MRI.use_nodbg_empty(SelReg)) { + if (Register::isVirtualRegister(SelReg) && MRI.use_nodbg_empty(SelReg)) { LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n'); LIS->RemoveMachineInstrFromMaps(*Sel); @@ -413,7 +413,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { if (!SaveExec || !SaveExec->isFullCopy()) continue; - unsigned SavedExec = SaveExec->getOperand(0).getReg(); + Register SavedExec = SaveExec->getOperand(0).getReg(); bool SafeToReplace = true; for (auto& U : MRI.use_nodbg_instructions(SavedExec)) { if (U.getParent() != SaveExec->getParent()) { @@ -434,7 +434,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { if (Changed) { for (auto Reg : RecalcRegs) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { LIS->removeInterval(Reg); if (!MRI.reg_empty(Reg)) LIS->createAndComputeVirtRegInterval(Reg); diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 2d71abc0612a..9b3b2436475c 100644 --- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -574,16 +574,16 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (TRI->isPhysicalRegister(Src1->getReg()) || - TRI->isPhysicalRegister(Dst->getReg())) + if (Register::isPhysicalRegister(Src1->getReg()) || + Register::isPhysicalRegister(Dst->getReg())) break; if (Opcode == AMDGPU::V_LSHLREV_B32_e32 || Opcode == AMDGPU::V_LSHLREV_B32_e64) { - return make_unique<SDWADstOperand>( + return std::make_unique<SDWADstOperand>( Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD); } else { - return make_unique<SDWASrcOperand>( + return std::make_unique<SDWASrcOperand>( Src1, Dst, *Imm == 16 ? 
WORD_1 : BYTE_3, false, false, Opcode != AMDGPU::V_LSHRREV_B32_e32 && Opcode != AMDGPU::V_LSHRREV_B32_e64); @@ -613,15 +613,15 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (TRI->isPhysicalRegister(Src1->getReg()) || - TRI->isPhysicalRegister(Dst->getReg())) + if (Register::isPhysicalRegister(Src1->getReg()) || + Register::isPhysicalRegister(Dst->getReg())) break; if (Opcode == AMDGPU::V_LSHLREV_B16_e32 || Opcode == AMDGPU::V_LSHLREV_B16_e64) { - return make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD); + return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD); } else { - return make_unique<SDWASrcOperand>( + return std::make_unique<SDWASrcOperand>( Src1, Dst, BYTE_1, false, false, Opcode != AMDGPU::V_LSHRREV_B16_e32 && Opcode != AMDGPU::V_LSHRREV_B16_e64); @@ -677,11 +677,11 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (TRI->isPhysicalRegister(Src0->getReg()) || - TRI->isPhysicalRegister(Dst->getReg())) + if (Register::isPhysicalRegister(Src0->getReg()) || + Register::isPhysicalRegister(Dst->getReg())) break; - return make_unique<SDWASrcOperand>( + return std::make_unique<SDWASrcOperand>( Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32); } @@ -706,11 +706,11 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (TRI->isPhysicalRegister(ValSrc->getReg()) || - TRI->isPhysicalRegister(Dst->getReg())) + if (Register::isPhysicalRegister(ValSrc->getReg()) || + Register::isPhysicalRegister(Dst->getReg())) break; - return make_unique<SDWASrcOperand>( + return std::make_unique<SDWASrcOperand>( ValSrc, Dst, *Imm == 0x0000ffff ? 
WORD_0 : BYTE_0); } @@ -840,7 +840,7 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); assert(OrDst && OrDst->isReg()); - return make_unique<SDWADstPreserveOperand>( + return std::make_unique<SDWADstPreserveOperand>( OrDst, OrSDWADef, OrOtherDef, DstSel); } @@ -1189,7 +1189,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, continue; } - unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), VGPR); if (Op.isImm()) diff --git a/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp index f9bfe96f65cb..6cdd12d0e7bd 100644 --- a/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp +++ b/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp @@ -90,12 +90,12 @@ bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) { if (!MO.isReg()) return false; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!TRI->isVGPR(*MRI, Reg)) return false; - if (TRI->isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return false; if (VRM->hasPhys(Reg)) @@ -124,14 +124,14 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { if (!MO.isReg()) continue; - const unsigned VirtReg = MO.getReg(); - if (TRI->isPhysicalRegister(VirtReg)) + const Register VirtReg = MO.getReg(); + if (Register::isPhysicalRegister(VirtReg)) continue; if (!VRM->hasPhys(VirtReg)) continue; - unsigned PhysReg = VRM->getPhys(VirtReg); + Register PhysReg = VRM->getPhys(VirtReg); const unsigned SubReg = MO.getSubReg(); if (SubReg != 0) { PhysReg = TRI->getSubReg(PhysReg, SubReg); @@ -149,7 +149,7 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { for (unsigned Reg : RegsToRewrite) { LIS->removeInterval(Reg); - const unsigned PhysReg = VRM->getPhys(Reg); + const Register PhysReg = VRM->getPhys(Reg); assert(PhysReg != 0); MFI->ReserveWWMRegister(PhysReg); } diff --git a/lib/Target/AMDGPU/SIProgramInfo.h b/lib/Target/AMDGPU/SIProgramInfo.h index 168f05f8fdd6..7c039a54b57f 100644 --- a/lib/Target/AMDGPU/SIProgramInfo.h +++ b/lib/Target/AMDGPU/SIProgramInfo.h @@ -41,6 +41,8 @@ struct SIProgramInfo { uint64_t ComputePGMRSrc2 = 0; uint32_t NumVGPR = 0; + uint32_t NumArchVGPR = 0; + uint32_t NumAccVGPR = 0; uint32_t NumSGPR = 0; uint32_t LDSSize = 0; bool FlatUsed = false; @@ -51,6 +53,9 @@ struct SIProgramInfo { // Number of VGPRs that meets number of waves per execution unit request. uint32_t NumVGPRsForWavesPerEU = 0; + // Final occupancy. + uint32_t Occupancy = 0; + // Whether there is recursion, dynamic allocas, indirect calls or some other // reason there may be statically unknown stack usage. 
bool DynamicCallStack = false; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index f152deb28004..f58bc3060c42 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -48,11 +48,6 @@ void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg, } } -static cl::opt<bool> EnableSpillSGPRToSMEM( - "amdgpu-spill-sgpr-to-smem", - cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"), - cl::init(false)); - static cl::opt<bool> EnableSpillSGPRToVGPR( - "amdgpu-spill-sgpr-to-vgpr", cl::desc("Enable spilling VGPRs to SGPRs"), @@ -61,17 +56,12 @@ static cl::opt<bool> EnableSpillSGPRToVGPR( SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : AMDGPURegisterInfo(), + ST(ST), SGPRPressureSets(getNumRegPressureSets()), VGPRPressureSets(getNumRegPressureSets()), AGPRPressureSets(getNumRegPressureSets()), - SpillSGPRToVGPR(false), - SpillSGPRToSMEM(false), + SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) { - if (EnableSpillSGPRToSMEM && ST.hasScalarStores()) - SpillSGPRToSMEM = true; - else if (EnableSpillSGPRToVGPR) - SpillSGPRToVGPR = true; - unsigned NumRegPressureSets = getNumRegPressureSets(); SGPRSetID = NumRegPressureSets; @@ -118,11 +108,9 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( const MachineFunction &MF) const { - - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); - return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass); } static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) { @@ -144,7 +132,6 @@ static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) { unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const { - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF)); return AMDGPU::SGPR_32RegClass.getRegister(Reg); } @@ -202,8 +189,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(AMDGPU::VCC_HI); } - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) { @@ -220,6 +205,14 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, Reg); } + // Reserve all the remaining AGPRs if there are no instructions to use them. + if (!ST.hasMAIInsts()) { + for (unsigned i = 0; i < MaxNumVGPRs; ++i) { + unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); + } + } + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); @@ -293,32 +286,17 @@ bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const bool SIRegisterInfo::requiresFrameIndexScavenging( const MachineFunction &MF) const { - const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (MFI.hasStackObjects()) - return true; - - // May need to deal with callee saved registers.
- const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - return !Info->isEntryFunction(); + // Do not use frame virtual registers. They used to be used for SGPRs, but + // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the + // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a + // spill. + return false; } bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (!MFI.hasStackObjects()) - return false; - - // The scavenger is used for large frames which may require finding a free - // register for large offsets. - if (!isUInt<12>(MFI.getStackSize())) - return true; - - // If using scalar stores, for spills, m0 is needed for the scalar store - // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual - // register for it during frame index elimination, so the scavenger is - // directly needed. - return MF.getSubtarget<GCNSubtarget>().hasScalarStores() && - MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs(); + return MFI.hasStackObjects(); } bool SIRegisterInfo::requiresVirtualBaseRegisters( @@ -372,8 +350,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, DL = Ins->getDebugLoc(); MachineFunction *MF = MBB->getParent(); - const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); - const SIInstrInfo *TII = Subtarget.getInstrInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); if (Offset == 0) { BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg) @@ -382,9 +359,9 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, } MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) .addImm(Offset); @@ -399,11 +376,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, int64_t Offset) const { - - MachineBasicBlock *MBB = MI.getParent(); - MachineFunction *MF = MBB->getParent(); - const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); - const SIInstrInfo *TII = Subtarget.getInstrInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); #ifndef NDEBUG // FIXME: Is it possible to be storing a frame index to itself? 
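(The dominant mechanical change across the AMDGPU hunks in this import is the move from plain unsigned register numbers to the Register wrapper: the isVirtualRegister/isPhysicalRegister predicates migrate from TargetRegisterInfo onto Register, and isValid() is used in the hunks below to test whether a scavenge produced a usable register. The following stand-alone C++ sketch illustrates the shape of that type; the encoding bit and the member set are illustrative assumptions, not LLVM's exact definition in llvm/CodeGen/Register.h.)

#include <cassert>

class Register {
  unsigned Reg = 0;
  // Assumption for illustration: virtual registers carry the top bit; the
  // real LLVM encoding may differ in detail.
  static constexpr unsigned VirtualBit = 1u << 31;
public:
  constexpr Register(unsigned R = 0) : Reg(R) {}
  constexpr operator unsigned() const { return Reg; } // interop with old unsigned code
  static constexpr bool isVirtualRegister(unsigned R) { return (R & VirtualBit) != 0; }
  static constexpr bool isPhysicalRegister(unsigned R) { return R != 0 && (R & VirtualBit) == 0; }
  constexpr bool isValid() const { return Reg != 0; } // 0 means "no register"
};

int main() {
  Register None;                  // default-constructed: the invalid register
  Register Phys(42);              // low ids act as physical registers here
  Register Virt((1u << 31) | 7);  // top bit set: a virtual register
  assert(!None.isValid());
  assert(Register::isPhysicalRegister(Phys));
  assert(Register::isVirtualRegister(Virt));
  return 0;
}

Because the wrapper converts implicitly back to unsigned, changes such as "Register DstReg = MI.getOperand(0).getReg();" drop in without disturbing downstream uses of the value.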
@@ -419,12 +392,15 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, #endif MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); +#ifndef NDEBUG + MachineBasicBlock *MBB = MI.getParent(); + MachineFunction *MF = MBB->getParent(); +#endif assert(FIOp && FIOp->isFI() && "frame index must be address operand"); assert(TII->isMUBUF(MI)); assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() == - MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() && - "should only be seeing frame offset relative FrameIndex"); - + MF->getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg() && + "should only be seeing stack pointer offset relative FrameIndex"); MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); int64_t NewOffset = OffsetOp->getImm() + Offset; @@ -564,7 +540,8 @@ static int getOffsetMUBUFLoad(unsigned Opc) { } } -static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI, +static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, + MachineBasicBlock::iterator MI, int Index, unsigned Lane, unsigned ValueReg, @@ -572,7 +549,6 @@ static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI, MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MI->getParent()->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane); @@ -595,11 +571,12 @@ static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI, // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not // need to handle the case where an SGPR may need to be spilled while spilling. -static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, +static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, MachineFrameInfo &MFI, MachineBasicBlock::iterator MI, int Index, int64_t Offset) { + const SIInstrInfo *TII = ST.getInstrInfo(); MachineBasicBlock *MBB = MI->getParent(); const DebugLoc &DL = MI->getDebugLoc(); bool IsStore = MI->mayStore(); @@ -611,7 +588,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, return false; const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); - if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr()) + if (spillVGPRtoAGPR(ST, MI, Index, 0, Reg->getReg(), false).getInstr()) return true; MachineInstrBuilder NewMI = @@ -624,6 +601,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .cloneMemRefs(*MI); const MachineOperand *VDataIn = TII->getNamedOperand(*MI, @@ -645,7 +623,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, RegScavenger *RS) const { MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MI->getParent()->getParent(); - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const MachineFrameInfo &MFI = MF->getFrameInfo(); @@ -707,8 +684,9 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, } for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) { - unsigned SubReg = NumSubRegs == 1 ? - ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i)); + Register SubReg = NumSubRegs == 1 + ? 
Register(ValueReg) + : getSubReg(ValueReg, getSubRegFromChannel(i)); unsigned SOffsetRegState = 0; unsigned SrcDstRegState = getDefRegState(!IsStore); @@ -718,7 +696,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, SrcDstRegState |= getKillRegState(IsKill); } - auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill); + auto MIB = spillVGPRtoAGPR(ST, MI, Index, i, SubReg, IsKill); if (!MIB.getInstr()) { unsigned FinalReg = SubReg; @@ -743,6 +721,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(NewMMO); if (!IsStore && TmpReg != AMDGPU::NoRegister) @@ -763,22 +742,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, } } -static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize, - bool Store) { - if (SuperRegSize % 16 == 0) { - return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR : - AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR }; - } - - if (SuperRegSize % 8 == 0) { - return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR : - AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR }; - } - - return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR : - AMDGPU::S_BUFFER_LOAD_DWORD_SGPR}; -} - bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, @@ -794,98 +757,37 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, if (OnlyToVGPR && !SpillToVGPR) return false; - MachineRegisterInfo &MRI = MF->getRegInfo(); - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); - unsigned SuperReg = MI->getOperand(0).getReg(); + Register SuperReg = MI->getOperand(0).getReg(); bool IsKill = MI->getOperand(0).isKill(); const DebugLoc &DL = MI->getDebugLoc(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - bool SpillToSMEM = spillSGPRToSMEM(); - if (SpillToSMEM && OnlyToVGPR) - return false; - - Register FrameReg = getFrameRegister(*MF); - assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() && SuperReg != MFI->getFrameOffsetReg() && SuperReg != MFI->getScratchWaveOffsetReg())); assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); - unsigned OffsetReg = AMDGPU::M0; unsigned M0CopyReg = AMDGPU::NoRegister; - if (SpillToSMEM) { - if (RS->isRegUsed(AMDGPU::M0)) { - M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg) - .addReg(AMDGPU::M0); - } - } - - unsigned ScalarStoreOp; unsigned EltSize = 4; const TargetRegisterClass *RC = getPhysRegClass(SuperReg); - if (SpillToSMEM && isSGPRClass(RC)) { - // XXX - if private_element_size is larger than 4 it might be useful to be - // able to spill wider vmem spills. - std::tie(EltSize, ScalarStoreOp) = - getSpillEltSize(getRegSizeInBits(*RC) / 8, true); - } ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); + // Scavenged temporary VGPR to use. It must be scavenged once for any number + // of spilled subregs. + Register TmpVGPR; + // SubReg carries the "Kill" flag when SubReg == SuperReg. unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill); for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - unsigned SubReg = NumSubRegs == 1 ? 
- SuperReg : getSubReg(SuperReg, SplitParts[i]); - - if (SpillToSMEM) { - int64_t FrOffset = FrameInfo.getObjectOffset(Index); - - // The allocated memory size is really the wavefront size * the frame - // index size. The widest register class is 64 bytes, so a 4-byte scratch - // allocation is enough to spill this in a single stack object. - // - // FIXME: Frame size/offsets are computed earlier than this, so the extra - // space is still unnecessarily allocated. - - unsigned Align = FrameInfo.getObjectAlignment(Index); - MachinePointerInfo PtrInfo - = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); - MachineMemOperand *MMO - = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, - EltSize, MinAlign(Align, EltSize * i)); - - // SMEM instructions only support a single offset, so increment the wave - // offset. - - int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); - if (Offset != 0) { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(FrameReg) - .addImm(Offset); - } else { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) - .addReg(FrameReg); - } - - BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp)) - .addReg(SubReg, getKillRegState(IsKill)) // sdata - .addReg(MFI->getScratchRSrcReg()) // sbase - .addReg(OffsetReg, RegState::Kill) // soff - .addImm(0) // glc - .addImm(0) // dlc - .addMemOperand(MMO); - - continue; - } + Register SubReg = + NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]); if (SpillToVGPR) { SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; @@ -915,15 +817,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, return false; // Spill SGPR to a frame index. - // TODO: Should VI try to spill to VGPR and then spill to SMEM? - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - // TODO: Should VI try to spill to VGPR and then spill to SMEM? + if (!TmpVGPR.isValid()) + TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); MachineInstrBuilder Mov - = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) + = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) .addReg(SubReg, SubKillState); - // There could be undef components of a spilled super register. // TODO: Can we detect this and skip the spill? 
if (NumSubRegs > 1) { @@ -941,7 +841,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, EltSize, MinAlign(Align, EltSize * i)); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) - .addReg(TmpReg, RegState::Kill) // src + .addReg(TmpVGPR, RegState::Kill) // src .addFrameIndex(Index) // vaddr .addReg(MFI->getScratchRSrcReg()) // srrsrc .addReg(MFI->getStackPtrOffsetReg()) // soffset @@ -965,7 +865,6 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, RegScavenger *RS, bool OnlyToVGPR) const { MachineFunction *MF = MI->getParent()->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock *MBB = MI->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); @@ -976,84 +875,27 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, return false; MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const DebugLoc &DL = MI->getDebugLoc(); - unsigned SuperReg = MI->getOperand(0).getReg(); - bool SpillToSMEM = spillSGPRToSMEM(); - if (SpillToSMEM && OnlyToVGPR) - return false; + Register SuperReg = MI->getOperand(0).getReg(); assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); - unsigned OffsetReg = AMDGPU::M0; unsigned M0CopyReg = AMDGPU::NoRegister; - if (SpillToSMEM) { - if (RS->isRegUsed(AMDGPU::M0)) { - M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg) - .addReg(AMDGPU::M0); - } - } - unsigned EltSize = 4; - unsigned ScalarLoadOp; - - Register FrameReg = getFrameRegister(*MF); const TargetRegisterClass *RC = getPhysRegClass(SuperReg); - if (SpillToSMEM && isSGPRClass(RC)) { - // XXX - if private_element_size is larger than 4 it might be useful to be - // able to spill wider vmem spills. - std::tie(EltSize, ScalarLoadOp) = - getSpillEltSize(getRegSizeInBits(*RC) / 8, false); - } ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); - // SubReg carries the "Kill" flag when SubReg == SuperReg. - int64_t FrOffset = FrameInfo.getObjectOffset(Index); + Register TmpVGPR; for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - unsigned SubReg = NumSubRegs == 1 ? - SuperReg : getSubReg(SuperReg, SplitParts[i]); - - if (SpillToSMEM) { - // FIXME: Size may be > 4 but extra bytes wasted. - unsigned Align = FrameInfo.getObjectAlignment(Index); - MachinePointerInfo PtrInfo - = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); - MachineMemOperand *MMO - = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, - EltSize, MinAlign(Align, EltSize * i)); - - // Add i * 4 offset - int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); - if (Offset != 0) { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(FrameReg) - .addImm(Offset); - } else { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) - .addReg(FrameReg); - } - - auto MIB = - BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg) - .addReg(MFI->getScratchRSrcReg()) // sbase - .addReg(OffsetReg, RegState::Kill) // soff - .addImm(0) // glc - .addImm(0) // dlc - .addMemOperand(MMO); - - if (NumSubRegs > 1 && i == 0) - MIB.addReg(SuperReg, RegState::ImplicitDefine); - - continue; - } + Register SubReg = + NumSubRegs == 1 ? 
SuperReg : getSubReg(SuperReg, SplitParts[i]); if (SpillToVGPR) { SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; @@ -1071,7 +913,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, // Restore SGPR from a stack slot. // FIXME: We should use S_LOAD_DWORD here for VI. - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + if (!TmpVGPR.isValid()) + TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); unsigned Align = FrameInfo.getObjectAlignment(Index); MachinePointerInfo PtrInfo @@ -1081,7 +924,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, MachineMemOperand::MOLoad, EltSize, MinAlign(Align, EltSize * i)); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg) + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR) .addFrameIndex(Index) // vaddr .addReg(MFI->getScratchRSrcReg()) // srsrc .addReg(MFI->getStackPtrOffsetReg()) // soffset @@ -1090,7 +933,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, auto MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) - .addReg(TmpReg, RegState::Kill); + .addReg(TmpVGPR, RegState::Kill); if (NumSubRegs > 1) MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); @@ -1141,11 +984,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { MachineFunction *MF = MI->getParent()->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock *MBB = MI->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); @@ -1255,13 +1096,16 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, // In an entry function/kernel the offset is already the absolute // address relative to the frame register. - unsigned DiffReg - = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register TmpDiffReg = + RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); + + // If there's no free SGPR, in-place modify the FP + Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg; bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32; Register ResultReg = IsCopy ? MI->getOperand(0).getReg() : - MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg) .addReg(FrameReg) @@ -1271,35 +1115,80 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (Offset == 0) { // XXX - This never happens because of emergency scavenging slot at 0? 
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg) - .addImm(Log2_32(ST.getWavefrontSize())) + .addImm(ST.getWavefrontSizeLog2()) .addReg(DiffReg); } else { - unsigned ScaledReg - = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) { + Register ScaledReg = + RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MIB, 0); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg) - .addImm(Log2_32(ST.getWavefrontSize())) - .addReg(DiffReg, RegState::Kill); + BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), + ScaledReg) + .addImm(ST.getWavefrontSizeLog2()) + .addReg(DiffReg, RegState::Kill); - // TODO: Fold if use instruction is another add of a constant. - if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { - TII->getAddNoCarry(*MBB, MI, DL, ResultReg) - .addImm(Offset) - .addReg(ScaledReg, RegState::Kill) - .addImm(0); // clamp bit + const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; + + // TODO: Fold if use instruction is another add of a constant. + if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { + // FIXME: This can fail + MIB.addImm(Offset); + MIB.addReg(ScaledReg, RegState::Kill); + if (!IsVOP2) + MIB.addImm(0); // clamp bit + } else { + Register ConstOffsetReg = + RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MIB, 0, false); + + // This should always be able to use the unused carry out. + assert(ConstOffsetReg && "this scavenge should not be able to fail"); + + BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) + .addImm(Offset); + MIB.addReg(ConstOffsetReg, RegState::Kill); + MIB.addReg(ScaledReg, RegState::Kill); + MIB.addImm(0); // clamp bit + } } else { - unsigned ConstOffsetReg - = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + // We have to produce a carry out, and there isn't a free SGPR + // pair for it. We can keep the whole computation on the SALU to + // avoid clobbering an additional register at the cost of an extra + // mov. - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) - .addImm(Offset); - TII->getAddNoCarry(*MBB, MI, DL, ResultReg) - .addReg(ConstOffsetReg, RegState::Kill) + // We may have 1 free scratch SGPR even though a carry out is + // unavailable. Only one additional mov is needed. + Register TmpScaledReg = + RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); + Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : DiffReg; + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) + .addReg(DiffReg, RegState::Kill) + .addImm(ST.getWavefrontSizeLog2()); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg) .addReg(ScaledReg, RegState::Kill) - .addImm(0); // clamp bit + .addImm(Offset); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg) + .addReg(ScaledReg, RegState::Kill); + + // If there were truly no free SGPRs, we need to undo everything. + if (!TmpScaledReg.isValid()) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg) + .addReg(ScaledReg, RegState::Kill) + .addImm(Offset); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) + .addReg(DiffReg, RegState::Kill) + .addImm(ST.getWavefrontSizeLog2()); + } } } + if (!TmpDiffReg.isValid()) { + // Restore the FP. + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg) + .addReg(FrameReg) + .addReg(MFI->getScratchWaveOffsetReg()); + } + // Don't introduce an extra copy if we're just materializing in a mov.
if (IsCopy) MI->eraseFromParent(); @@ -1325,7 +1214,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int64_t NewOffset = OldImm + Offset; if (isUInt<12>(NewOffset) && - buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) { + buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) { MI->eraseFromParent(); return; } @@ -1337,7 +1226,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int64_t Offset = FrameInfo.getObjectOffset(Index); FIOp.ChangeToImmediate(Offset); if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) .addImm(Offset); FIOp.ChangeToRegister(TmpReg, false, false, true); @@ -1347,27 +1236,13 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const { - const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg); - unsigned Size = getRegSizeInBits(*RC); - unsigned AltName = AMDGPU::NoRegAltName; - - switch (Size) { - case 32: AltName = AMDGPU::Reg32; break; - case 64: AltName = AMDGPU::Reg64; break; - case 96: AltName = AMDGPU::Reg96; break; - case 128: AltName = AMDGPU::Reg128; break; - case 160: AltName = AMDGPU::Reg160; break; - case 256: AltName = AMDGPU::Reg256; break; - case 512: AltName = AMDGPU::Reg512; break; - case 1024: AltName = AMDGPU::Reg1024; break; - } - return AMDGPUInstPrinter::getRegisterName(Reg, AltName); + return AMDGPUInstPrinter::getRegisterName(Reg); } // FIXME: This is very slow. It might be worth creating a map from physreg to // register class. const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { - assert(!TargetRegisterInfo::isVirtualRegister(Reg)); + assert(!Register::isVirtualRegister(Reg)); static const TargetRegisterClass *const BaseClasses[] = { &AMDGPU::VGPR_32RegClass, @@ -1408,8 +1283,6 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { // TargetRegisterClass to mark which classes are VGPRs to make this trivial. 
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { unsigned Size = getRegSizeInBits(*RC); - if (Size < 32) - return false; switch (Size) { case 32: return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; @@ -1427,8 +1300,11 @@ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; case 1024: return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr; + case 1: + return getCommonSubClass(&AMDGPU::VReg_1RegClass, RC) != nullptr; default: - llvm_unreachable("Invalid register class size"); + assert(Size < 32 && "Invalid register class size"); + return false; } } @@ -1476,6 +1352,8 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( return &AMDGPU::VReg_512RegClass; case 1024: return &AMDGPU::VReg_1024RegClass; + case 1: + return &AMDGPU::VReg_1RegClass; default: llvm_unreachable("Invalid register class size"); } @@ -1509,7 +1387,7 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass( case 96: return &AMDGPU::SReg_96RegClass; case 128: - return &AMDGPU::SReg_128RegClass; + return &AMDGPU::SGPR_128RegClass; case 160: return &AMDGPU::SReg_160RegClass; case 256: @@ -1539,7 +1417,7 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( case 3: return &AMDGPU::SReg_96RegClass; case 4: - return &AMDGPU::SReg_128RegClass; + return &AMDGPU::SGPR_128RegClass; case 5: return &AMDGPU::SReg_160RegClass; case 8: @@ -1587,6 +1465,15 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( } } +bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { + if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST && + OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST) + return !ST.hasMFMAInlineLiteralBug(); + + return OpType >= AMDGPU::OPERAND_SRC_FIRST && + OpType <= AMDGPU::OPERAND_SRC_LAST; +} + bool SIRegisterInfo::shouldRewriteCopySrc( const TargetRegisterClass *DefRC, unsigned DefSubReg, @@ -1802,7 +1689,7 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC const TargetRegisterClass* SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, unsigned Reg) const { - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) return MRI.getRegClass(Reg); return getPhysRegClass(Reg); @@ -1845,8 +1732,6 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { - - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), @@ -1900,18 +1785,22 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass; case AMDGPU::SGPRRegBankID: - return &AMDGPU::SReg_32_XM0RegClass; + return &AMDGPU::SReg_32RegClass; case AMDGPU::SCCRegBankID: // This needs to return an allocatable class, so don't bother returning // the dummy SCC class. - return &AMDGPU::SReg_32_XM0RegClass; + // + // FIXME: This is a grotesque hack. We use SGPR_32 as an indication this + // was not a VCC bank value since we use the larger class SReg_32 for + // other values. These should all use SReg_32. + return &AMDGPU::SGPR_32RegClass; default: llvm_unreachable("unknown register bank"); } } case 32: return RB.getID() == AMDGPU::VGPRRegBankID ?
&AMDGPU::VGPR_32RegClass : - &AMDGPU::SReg_32_XM0RegClass; + &AMDGPU::SReg_32RegClass; case 64: return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass : &AMDGPU::SReg_64_XEXECRegClass; @@ -1920,7 +1809,7 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, &AMDGPU::SReg_96RegClass; case 128: return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass : - &AMDGPU::SReg_128RegClass; + &AMDGPU::SGPR_128RegClass; case 160: return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass : &AMDGPU::SReg_160RegClass; @@ -1930,10 +1819,13 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, case 512: return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass : &AMDGPU::SReg_512RegClass; + case 1024: + return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_1024RegClass : + &AMDGPU::SReg_1024RegClass; default: if (Size < 32) return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass : - &AMDGPU::SReg_32_XM0RegClass; + &AMDGPU::SReg_32RegClass; return nullptr; } } @@ -1941,9 +1833,12 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, const TargetRegisterClass * SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const { - if (const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg())) + const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); + if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>()) return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI); - return nullptr; + + const TargetRegisterClass *RC = RCOrRB.get<const TargetRegisterClass*>(); + return getAllocatableClass(RC); } unsigned SIRegisterInfo::getVCC() const { @@ -1974,7 +1869,7 @@ MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg, SlotIndex UseIdx = LIS->getInstructionIndex(Use); SlotIndex DefIdx; - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { if (!LIS->hasInterval(Reg)) return nullptr; LiveInterval &LI = LIS->getInterval(Reg); diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 34487c96e72e..ac3dea1a1a28 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -27,6 +27,7 @@ class SIMachineFunctionInfo; class SIRegisterInfo final : public AMDGPURegisterInfo { private: + const GCNSubtarget &ST; unsigned SGPRSetID; unsigned VGPRSetID; unsigned AGPRSetID; @@ -34,7 +35,6 @@ private: BitVector VGPRPressureSets; BitVector AGPRPressureSets; bool SpillSGPRToVGPR; - bool SpillSGPRToSMEM; bool isWave32; void classifyPressureSet(unsigned PSetID, unsigned Reg, @@ -46,10 +46,6 @@ public: return SpillSGPRToVGPR; } - bool spillSGPRToSMEM() const { - return SpillSGPRToSMEM; - } - /// Return the end register initially reserved for the scratch buffer in case /// spilling is needed. unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; @@ -141,7 +137,7 @@ public: bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const { const TargetRegisterClass *RC; - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) RC = MRI.getRegClass(Reg); else RC = getPhysRegClass(Reg); @@ -193,10 +189,7 @@ public: /// \returns True if operands defined with this operand type can accept /// an inline constant. i.e. An integer value in the range (-16, 64) or /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. 
- bool opCanUseInlineConstant(unsigned OpType) const { - return OpType >= AMDGPU::OPERAND_SRC_FIRST && - OpType <= AMDGPU::OPERAND_SRC_LAST; - } + bool opCanUseInlineConstant(unsigned OpType) const; unsigned findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, @@ -270,7 +263,7 @@ public: const MachineRegisterInfo &MRI) const override; const TargetRegisterClass *getBoolRC() const { - return isWave32 ? &AMDGPU::SReg_32_XM0RegClass + return isWave32 ? &AMDGPU::SReg_32RegClass : &AMDGPU::SReg_64RegClass; } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index d5948a7862cc..82219cbdf3b2 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -37,50 +37,52 @@ class getSubRegs<int size> { !if(!eq(size, 16), ret16, ret32)))))); } -let Namespace = "AMDGPU" in { -defset list<RegAltNameIndex> AllRegAltNameIndices = { - def Reg32 : RegAltNameIndex; - def Reg64 : RegAltNameIndex; - def Reg96 : RegAltNameIndex; - def Reg128 : RegAltNameIndex; - def Reg160 : RegAltNameIndex; - def Reg256 : RegAltNameIndex; - def Reg512 : RegAltNameIndex; - def Reg1024 : RegAltNameIndex; +// Generates list of sequential register tuple names. +// E.g. RegSeqNames<3,2,2,"s">.ret -> [ "s[0:1]", "s[2:3]" ] +class RegSeqNames<int last_reg, int stride, int size, string prefix, + int start = 0> { + int next = !add(start, stride); + int end_reg = !add(!add(start, size), -1); + list<string> ret = + !if(!le(end_reg, last_reg), + !listconcat([prefix # "[" # start # ":" # end_reg # "]"], + RegSeqNames<last_reg, stride, size, prefix, next>.ret), + []); } + +// Generates list of dags for register tuples. +class RegSeqDags<RegisterClass RC, int last_reg, int stride, int size, + int start = 0> { + dag trunc_rc = (trunc RC, + !if(!and(!eq(stride, 1), !eq(start, 0)), + !add(!add(last_reg, 2), !mul(size, -1)), + !add(last_reg, 1))); + list<dag> ret = + !if(!lt(start, size), + !listconcat([(add (decimate (shl trunc_rc, start), stride))], + RegSeqDags<RC, last_reg, stride, size, !add(start, 1)>.ret), + []); } +class SIRegisterTuples<list<SubRegIndex> Indices, RegisterClass RC, + int last_reg, int stride, int size, string prefix> : + RegisterTuples<Indices, + RegSeqDags<RC, last_reg, stride, size>.ret, + RegSeqNames<last_reg, stride, size, prefix>.ret>; + //===----------------------------------------------------------------------===// // Declarations that describe the SI registers //===----------------------------------------------------------------------===// -class SIReg <string n, bits<16> regIdx = 0, string prefix = "", - int regNo = !cast<int>(regIdx)> : - Register<n, !if(!eq(prefix, ""), - [ n, n, n, n, n, n, n, n ], - [ prefix # regNo, - prefix # "[" # regNo # ":" # !and(!add(regNo, 1), 255) # "]", - prefix # "[" # regNo # ":" # !and(!add(regNo, 2), 255) # "]", - prefix # "[" # regNo # ":" # !and(!add(regNo, 3), 255) # "]", - prefix # "[" # regNo # ":" # !and(!add(regNo, 4), 255) # "]", - prefix # "[" # regNo # ":" # !and(!add(regNo, 7), 255) # "]", - prefix # "[" # regNo # ":" # !and(!add(regNo, 15), 255) # "]", - prefix # "[" # regNo # ":" # !and(!add(regNo, 31), 255) # "]", - ])>, +class SIReg <string n, bits<16> regIdx = 0> : + Register<n>, DwarfRegNum<[!cast<int>(HWEncoding)]> { let Namespace = "AMDGPU"; - let RegAltNameIndices = AllRegAltNameIndices; // This is the not yet the complete register encoding. An additional // bit is set for VGPRs.
let HWEncoding = regIdx; } -class SIRegisterWithSubRegs<string n, list<Register> subregs> : - RegisterWithSubRegs<n, subregs> { - let RegAltNameIndices = AllRegAltNameIndices; - let AltNames = [ n, n, n, n, n, n, n, n ]; -} - // Special Registers def VCC_LO : SIReg<"vcc_lo", 106>; def VCC_HI : SIReg<"vcc_hi", 107>; @@ -93,7 +95,7 @@ def SP_REG : SIReg<"sp", 0>; def SCRATCH_WAVE_OFFSET_REG : SIReg<"scratch_wave_offset", 0>; // VCC for 64-bit instructions -def VCC : SIRegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, +def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, DwarfRegAlias<VCC_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -103,7 +105,7 @@ def VCC : SIRegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, def EXEC_LO : SIReg<"exec_lo", 126>; def EXEC_HI : SIReg<"exec_hi", 127>; -def EXEC : SIRegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>, +def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>, DwarfRegAlias<EXEC_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -134,7 +136,7 @@ def LDS_DIRECT : SIReg <"src_lds_direct", 254>; def XNACK_MASK_LO : SIReg<"xnack_mask_lo", 104>; def XNACK_MASK_HI : SIReg<"xnack_mask_hi", 105>; -def XNACK_MASK : SIRegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>, +def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>, DwarfRegAlias<XNACK_MASK_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -145,7 +147,7 @@ def XNACK_MASK : SIRegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_ def TBA_LO : SIReg<"tba_lo", 108>; def TBA_HI : SIReg<"tba_hi", 109>; -def TBA : SIRegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, +def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, DwarfRegAlias<TBA_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -155,7 +157,7 @@ def TBA : SIRegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, def TMA_LO : SIReg<"tma_lo", 110>; def TMA_HI : SIReg<"tma_hi", 111>; -def TMA : SIRegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>, +def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>, DwarfRegAlias<TMA_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -175,7 +177,7 @@ multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> { } class FlatReg <Register lo, Register hi, bits<16> encoding> : - SIRegisterWithSubRegs<"flat_scratch", [lo, hi]>, + RegisterWithSubRegs<"flat_scratch", [lo, hi]>, DwarfRegAlias<lo> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -191,19 +193,19 @@ def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>; // SGPR registers foreach Index = 0-105 in { - def SGPR#Index : SIReg <"SGPR"#Index, Index, "s">; + def SGPR#Index : SIReg <"s"#Index, Index>; } // VGPR registers foreach Index = 0-255 in { - def VGPR#Index : SIReg <"VGPR"#Index, Index, "v"> { + def VGPR#Index : SIReg <"v"#Index, Index> { let HWEncoding{8} = 1; } } // AccVGPR registers foreach Index = 0-255 in { - def AGPR#Index : SIReg <"AGPR"#Index, Index, "a"> { + def AGPR#Index : SIReg <"a"#Index, Index> { let HWEncoding{8} = 1; } } @@ -226,102 +228,32 @@ def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> { // SGPR 32-bit registers def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add (sequence "SGPR%u", 0, 105)), Reg32> { + (add (sequence "SGPR%u", 0, 105))> { // Give all SGPR classes higher priority than VGPR classes, because // we want to spill SGPRs to VGPRs. 
   let AllocationPriority = 9;
 }

 // SGPR 64-bit registers
-def SGPR_64Regs : RegisterTuples<getSubRegs<2>.ret,
-                                 [(add (decimate SGPR_32, 2)),
-                                  (add (decimate (shl SGPR_32, 1), 2))]>;
+def SGPR_64Regs : SIRegisterTuples<getSubRegs<2>.ret, SGPR_32, 105, 2, 2, "s">;

 // SGPR 96-bit registers. No operations use these, but for symmetry with 96-bit VGPRs.
-def SGPR_96Regs : RegisterTuples<getSubRegs<3>.ret,
-                                 [(add (decimate SGPR_32, 3)),
-                                  (add (decimate (shl SGPR_32, 1), 3)),
-                                  (add (decimate (shl SGPR_32, 2), 3))]>;
+def SGPR_96Regs : SIRegisterTuples<getSubRegs<3>.ret, SGPR_32, 105, 3, 3, "s">;

 // SGPR 128-bit registers
-def SGPR_128Regs : RegisterTuples<getSubRegs<4>.ret,
-                                  [(add (decimate SGPR_32, 4)),
-                                   (add (decimate (shl SGPR_32, 1), 4)),
-                                   (add (decimate (shl SGPR_32, 2), 4)),
-                                   (add (decimate (shl SGPR_32, 3), 4))]>;
+def SGPR_128Regs : SIRegisterTuples<getSubRegs<4>.ret, SGPR_32, 105, 4, 4, "s">;

 // SGPR 160-bit registers. No operations use these, but for symmetry with 160-bit VGPRs.
-def SGPR_160Regs : RegisterTuples<getSubRegs<5>.ret,
-                                  [(add (decimate SGPR_32, 4)),
-                                   (add (decimate (shl SGPR_32, 1), 4)),
-                                   (add (decimate (shl SGPR_32, 2), 4)),
-                                   (add (decimate (shl SGPR_32, 3), 4)),
-                                   (add (decimate (shl SGPR_32, 4), 4))]>;
+def SGPR_160Regs : SIRegisterTuples<getSubRegs<5>.ret, SGPR_32, 105, 4, 5, "s">;

 // SGPR 256-bit registers
-def SGPR_256Regs : RegisterTuples<getSubRegs<8>.ret,
-                                  [(add (decimate SGPR_32, 4)),
-                                   (add (decimate (shl SGPR_32, 1), 4)),
-                                   (add (decimate (shl SGPR_32, 2), 4)),
-                                   (add (decimate (shl SGPR_32, 3), 4)),
-                                   (add (decimate (shl SGPR_32, 4), 4)),
-                                   (add (decimate (shl SGPR_32, 5), 4)),
-                                   (add (decimate (shl SGPR_32, 6), 4)),
-                                   (add (decimate (shl SGPR_32, 7), 4))]>;
+def SGPR_256Regs : SIRegisterTuples<getSubRegs<8>.ret, SGPR_32, 105, 4, 8, "s">;

 // SGPR 512-bit registers
-def SGPR_512Regs : RegisterTuples<getSubRegs<16>.ret,
-                                  [(add (decimate SGPR_32, 4)),
-                                   (add (decimate (shl SGPR_32, 1), 4)),
-                                   (add (decimate (shl SGPR_32, 2), 4)),
-                                   (add (decimate (shl SGPR_32, 3), 4)),
-                                   (add (decimate (shl SGPR_32, 4), 4)),
-                                   (add (decimate (shl SGPR_32, 5), 4)),
-                                   (add (decimate (shl SGPR_32, 6), 4)),
-                                   (add (decimate (shl SGPR_32, 7), 4)),
-                                   (add (decimate (shl SGPR_32, 8), 4)),
-                                   (add (decimate (shl SGPR_32, 9), 4)),
-                                   (add (decimate (shl SGPR_32, 10), 4)),
-                                   (add (decimate (shl SGPR_32, 11), 4)),
-                                   (add (decimate (shl SGPR_32, 12), 4)),
-                                   (add (decimate (shl SGPR_32, 13), 4)),
-                                   (add (decimate (shl SGPR_32, 14), 4)),
-                                   (add (decimate (shl SGPR_32, 15), 4))]>;
+def SGPR_512Regs : SIRegisterTuples<getSubRegs<16>.ret, SGPR_32, 105, 4, 16, "s">;

 // SGPR 1024-bit registers
-def SGPR_1024Regs : RegisterTuples<getSubRegs<32>.ret,
-                                   [(add (decimate SGPR_32, 4)),
-                                    (add (decimate (shl SGPR_32, 1), 4)),
-                                    (add (decimate (shl SGPR_32, 2), 4)),
-                                    (add (decimate (shl SGPR_32, 3), 4)),
-                                    (add (decimate (shl SGPR_32, 4), 4)),
-                                    (add (decimate (shl SGPR_32, 5), 4)),
-                                    (add (decimate (shl SGPR_32, 6), 4)),
-                                    (add (decimate (shl SGPR_32, 7), 4)),
-                                    (add (decimate (shl SGPR_32, 8), 4)),
-                                    (add (decimate (shl SGPR_32, 9), 4)),
-                                    (add (decimate (shl SGPR_32, 10), 4)),
-                                    (add (decimate (shl SGPR_32, 11), 4)),
-                                    (add (decimate (shl SGPR_32, 12), 4)),
-                                    (add (decimate (shl SGPR_32, 13), 4)),
-                                    (add (decimate (shl SGPR_32, 14), 4)),
-                                    (add (decimate (shl SGPR_32, 15), 4)),
-                                    (add (decimate (shl SGPR_32, 16), 4)),
-                                    (add (decimate (shl SGPR_32, 17), 4)),
-                                    (add (decimate (shl SGPR_32, 18), 4)),
-                                    (add (decimate (shl SGPR_32, 19), 4)),
-                                    (add (decimate (shl SGPR_32, 20), 4)),
-                                    (add (decimate (shl SGPR_32, 21), 4)),
-                                    (add (decimate (shl SGPR_32, 22), 4)),
-                                    (add (decimate (shl SGPR_32, 23), 4)),
-                                    (add (decimate (shl SGPR_32, 24), 4)),
-                                    (add (decimate (shl SGPR_32, 25), 4)),
-                                    (add (decimate (shl SGPR_32, 26), 4)),
-                                    (add (decimate (shl SGPR_32, 27), 4)),
-                                    (add (decimate (shl SGPR_32, 28), 4)),
-                                    (add (decimate (shl SGPR_32, 29), 4)),
-                                    (add (decimate (shl SGPR_32, 30), 4)),
-                                    (add (decimate (shl SGPR_32, 31), 4))]>;
+def SGPR_1024Regs : SIRegisterTuples<getSubRegs<32>.ret, SGPR_32, 105, 4, 32, "s">;

 // Trap handler TMP 32-bit registers
 def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
@@ -330,51 +262,21 @@ def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
 }

 // Trap handler TMP 64-bit registers
-def TTMP_64Regs : RegisterTuples<getSubRegs<2>.ret,
-                                 [(add (decimate TTMP_32, 2)),
-                                  (add (decimate (shl TTMP_32, 1), 2))]>;
+def TTMP_64Regs : SIRegisterTuples<getSubRegs<2>.ret, TTMP_32, 15, 2, 2, "ttmp">;

 // Trap handler TMP 128-bit registers
-def TTMP_128Regs : RegisterTuples<getSubRegs<4>.ret,
-                                  [(add (decimate TTMP_32, 4)),
-                                   (add (decimate (shl TTMP_32, 1), 4)),
-                                   (add (decimate (shl TTMP_32, 2), 4)),
-                                   (add (decimate (shl TTMP_32, 3), 4))]>;
+def TTMP_128Regs : SIRegisterTuples<getSubRegs<4>.ret, TTMP_32, 15, 4, 4, "ttmp">;

-def TTMP_256Regs : RegisterTuples<getSubRegs<8>.ret,
-                                  [(add (decimate TTMP_32, 4)),
-                                   (add (decimate (shl TTMP_32, 1), 4)),
-                                   (add (decimate (shl TTMP_32, 2), 4)),
-                                   (add (decimate (shl TTMP_32, 3), 4)),
-                                   (add (decimate (shl TTMP_32, 4), 4)),
-                                   (add (decimate (shl TTMP_32, 5), 4)),
-                                   (add (decimate (shl TTMP_32, 6), 4)),
-                                   (add (decimate (shl TTMP_32, 7), 4))]>;
+def TTMP_256Regs : SIRegisterTuples<getSubRegs<8>.ret, TTMP_32, 15, 4, 8, "ttmp">;

-def TTMP_512Regs : RegisterTuples<getSubRegs<16>.ret,
-                                  [(add (decimate TTMP_32, 4)),
-                                   (add (decimate (shl TTMP_32, 1), 4)),
-                                   (add (decimate (shl TTMP_32, 2), 4)),
-                                   (add (decimate (shl TTMP_32, 3), 4)),
-                                   (add (decimate (shl TTMP_32, 4), 4)),
-                                   (add (decimate (shl TTMP_32, 5), 4)),
-                                   (add (decimate (shl TTMP_32, 6), 4)),
-                                   (add (decimate (shl TTMP_32, 7), 4)),
-                                   (add (decimate (shl TTMP_32, 8), 4)),
-                                   (add (decimate (shl TTMP_32, 9), 4)),
-                                   (add (decimate (shl TTMP_32, 10), 4)),
-                                   (add (decimate (shl TTMP_32, 11), 4)),
-                                   (add (decimate (shl TTMP_32, 12), 4)),
-                                   (add (decimate (shl TTMP_32, 13), 4)),
-                                   (add (decimate (shl TTMP_32, 14), 4)),
-                                   (add (decimate (shl TTMP_32, 15), 4))]>;
+def TTMP_512Regs : SIRegisterTuples<getSubRegs<16>.ret, TTMP_32, 15, 4, 16, "ttmp">;

 class TmpRegTuplesBase<int index, int size,
                        list<Register> subRegs,
                        list<SubRegIndex> indices = getSubRegs<size>.ret,
                        int index1 = !add(index, !add(size, -1)),
                        string name = "ttmp["#index#":"#index1#"]"> :
-  SIRegisterWithSubRegs<name, subRegs> {
+  RegisterWithSubRegs<name, subRegs> {
   let HWEncoding = subRegs[0].HWEncoding;
   let SubRegIndices = indices;
 }
@@ -448,196 +350,80 @@ def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TT
                    TTMP8_gfx9_gfx10, TTMP9_gfx9_gfx10, TTMP10_gfx9_gfx10, TTMP11_gfx9_gfx10,
                    TTMP12_gfx9_gfx10, TTMP13_gfx9_gfx10, TTMP14_gfx9_gfx10, TTMP15_gfx9_gfx10]>;

+class RegisterTypes<list<ValueType> reg_types> {
+  list<ValueType> types = reg_types;
+}
+
+def Reg16Types : RegisterTypes<[i16, f16]>;
+def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>;
+

 // VGPR 32-bit registers
 // i16/f16 only on VI+
-def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
-                            (add (sequence "VGPR%u", 0, 255)), Reg32> {
+def VGPR_32 : RegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
+                            (add (sequence "VGPR%u", 0, 255))> {
   let AllocationPriority = 1;
   let Size = 32;
 }

 // VGPR 64-bit registers
-def VGPR_64 : RegisterTuples<getSubRegs<2>.ret,
-                             [(add (trunc VGPR_32, 255)),
-                              (add (shl VGPR_32, 1))]>;
+def VGPR_64 : SIRegisterTuples<getSubRegs<2>.ret, VGPR_32, 255, 1, 2, "v">;

 // VGPR 96-bit registers
-def VGPR_96 : RegisterTuples<getSubRegs<3>.ret,
-                             [(add (trunc VGPR_32, 254)),
-                              (add (shl VGPR_32, 1)),
-                              (add (shl VGPR_32, 2))]>;
+def VGPR_96 : SIRegisterTuples<getSubRegs<3>.ret, VGPR_32, 255, 1, 3, "v">;

 // VGPR 128-bit registers
-def VGPR_128 : RegisterTuples<getSubRegs<4>.ret,
-                              [(add (trunc VGPR_32, 253)),
-                               (add (shl VGPR_32, 1)),
-                               (add (shl VGPR_32, 2)),
-                               (add (shl VGPR_32, 3))]>;
+def VGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, VGPR_32, 255, 1, 4, "v">;

 // VGPR 160-bit registers
-def VGPR_160 : RegisterTuples<getSubRegs<5>.ret,
-                              [(add (trunc VGPR_32, 252)),
-                               (add (shl VGPR_32, 1)),
-                               (add (shl VGPR_32, 2)),
-                               (add (shl VGPR_32, 3)),
-                               (add (shl VGPR_32, 4))]>;
+def VGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, VGPR_32, 255, 1, 5, "v">;

 // VGPR 256-bit registers
-def VGPR_256 : RegisterTuples<getSubRegs<8>.ret,
-                              [(add (trunc VGPR_32, 249)),
-                               (add (shl VGPR_32, 1)),
-                               (add (shl VGPR_32, 2)),
-                               (add (shl VGPR_32, 3)),
-                               (add (shl VGPR_32, 4)),
-                               (add (shl VGPR_32, 5)),
-                               (add (shl VGPR_32, 6)),
-                               (add (shl VGPR_32, 7))]>;
+def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">;

 // VGPR 512-bit registers
-def VGPR_512 : RegisterTuples<getSubRegs<16>.ret,
-                              [(add (trunc VGPR_32, 241)),
-                               (add (shl VGPR_32, 1)),
-                               (add (shl VGPR_32, 2)),
-                               (add (shl VGPR_32, 3)),
-                               (add (shl VGPR_32, 4)),
-                               (add (shl VGPR_32, 5)),
-                               (add (shl VGPR_32, 6)),
-                               (add (shl VGPR_32, 7)),
-                               (add (shl VGPR_32, 8)),
-                               (add (shl VGPR_32, 9)),
-                               (add (shl VGPR_32, 10)),
-                               (add (shl VGPR_32, 11)),
-                               (add (shl VGPR_32, 12)),
-                               (add (shl VGPR_32, 13)),
-                               (add (shl VGPR_32, 14)),
-                               (add (shl VGPR_32, 15))]>;
+def VGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, VGPR_32, 255, 1, 16, "v">;

 // VGPR 1024-bit registers
-def VGPR_1024 : RegisterTuples<getSubRegs<32>.ret,
-                               [(add (trunc VGPR_32, 225)),
-                                (add (shl VGPR_32, 1)),
-                                (add (shl VGPR_32, 2)),
-                                (add (shl VGPR_32, 3)),
-                                (add (shl VGPR_32, 4)),
-                                (add (shl VGPR_32, 5)),
-                                (add (shl VGPR_32, 6)),
-                                (add (shl VGPR_32, 7)),
-                                (add (shl VGPR_32, 8)),
-                                (add (shl VGPR_32, 9)),
-                                (add (shl VGPR_32, 10)),
-                                (add (shl VGPR_32, 11)),
-                                (add (shl VGPR_32, 12)),
-                                (add (shl VGPR_32, 13)),
-                                (add (shl VGPR_32, 14)),
-                                (add (shl VGPR_32, 15)),
-                                (add (shl VGPR_32, 16)),
-                                (add (shl VGPR_32, 17)),
-                                (add (shl VGPR_32, 18)),
-                                (add (shl VGPR_32, 19)),
-                                (add (shl VGPR_32, 20)),
-                                (add (shl VGPR_32, 21)),
-                                (add (shl VGPR_32, 22)),
-                                (add (shl VGPR_32, 23)),
-                                (add (shl VGPR_32, 24)),
-                                (add (shl VGPR_32, 25)),
-                                (add (shl VGPR_32, 26)),
-                                (add (shl VGPR_32, 27)),
-                                (add (shl VGPR_32, 28)),
-                                (add (shl VGPR_32, 29)),
-                                (add (shl VGPR_32, 30)),
-                                (add (shl VGPR_32, 31))]>;
+def VGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, VGPR_32, 255, 1, 32, "v">;

 // AccVGPR 32-bit registers
 def AGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
-                            (add (sequence "AGPR%u", 0, 255)), Reg32> {
+                            (add (sequence "AGPR%u", 0, 255))> {
   let AllocationPriority = 1;
   let Size = 32;
 }

 // AGPR 64-bit registers
-def AGPR_64 : RegisterTuples<getSubRegs<2>.ret,
-                             [(add (trunc AGPR_32, 255)),
-                              (add (shl AGPR_32, 1))]>;
+def AGPR_64 : SIRegisterTuples<getSubRegs<2>.ret, AGPR_32, 255, 1, 2, "a">;

 // AGPR 128-bit registers
-def AGPR_128 : RegisterTuples<getSubRegs<4>.ret,
-                              [(add (trunc AGPR_32, 253)),
-                               (add (shl AGPR_32, 1)),
-                               (add (shl AGPR_32, 2)),
-                               (add (shl AGPR_32, 3))]>;
+def AGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, AGPR_32, 255, 1, 4, "a">;

 // AGPR 512-bit registers
-def AGPR_512 : RegisterTuples<getSubRegs<16>.ret,
-                              [(add (trunc AGPR_32, 241)),
-                               (add (shl AGPR_32, 1)),
-                               (add (shl AGPR_32, 2)),
-                               (add (shl AGPR_32, 3)),
-                               (add (shl AGPR_32, 4)),
-                               (add (shl AGPR_32, 5)),
-                               (add (shl AGPR_32, 6)),
-                               (add (shl AGPR_32, 7)),
-                               (add (shl AGPR_32, 8)),
-                               (add (shl AGPR_32, 9)),
-                               (add (shl AGPR_32, 10)),
-                               (add (shl AGPR_32, 11)),
-                               (add (shl AGPR_32, 12)),
-                               (add (shl AGPR_32, 13)),
-                               (add (shl AGPR_32, 14)),
-                               (add (shl AGPR_32, 15))]>;
+def AGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, AGPR_32, 255, 1, 16, "a">;

 // AGPR 1024-bit registers
-def AGPR_1024 : RegisterTuples<getSubRegs<32>.ret,
-                               [(add (trunc AGPR_32, 225)),
-                                (add (shl AGPR_32, 1)),
-                                (add (shl AGPR_32, 2)),
-                                (add (shl AGPR_32, 3)),
-                                (add (shl AGPR_32, 4)),
-                                (add (shl AGPR_32, 5)),
-                                (add (shl AGPR_32, 6)),
-                                (add (shl AGPR_32, 7)),
-                                (add (shl AGPR_32, 8)),
-                                (add (shl AGPR_32, 9)),
-                                (add (shl AGPR_32, 10)),
-                                (add (shl AGPR_32, 11)),
-                                (add (shl AGPR_32, 12)),
-                                (add (shl AGPR_32, 13)),
-                                (add (shl AGPR_32, 14)),
-                                (add (shl AGPR_32, 15)),
-                                (add (shl AGPR_32, 16)),
-                                (add (shl AGPR_32, 17)),
-                                (add (shl AGPR_32, 18)),
-                                (add (shl AGPR_32, 19)),
-                                (add (shl AGPR_32, 20)),
-                                (add (shl AGPR_32, 21)),
-                                (add (shl AGPR_32, 22)),
-                                (add (shl AGPR_32, 23)),
-                                (add (shl AGPR_32, 24)),
-                                (add (shl AGPR_32, 25)),
-                                (add (shl AGPR_32, 26)),
-                                (add (shl AGPR_32, 27)),
-                                (add (shl AGPR_32, 28)),
-                                (add (shl AGPR_32, 29)),
-                                (add (shl AGPR_32, 30)),
-                                (add (shl AGPR_32, 31))]>;
+def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">;

 //===----------------------------------------------------------------------===//
 //  Register classes used as source and destination
 //===----------------------------------------------------------------------===//

 def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
-                                   (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG), Reg32> {
+                                   (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> {
   let isAllocatable = 0;
   let CopyCost = -1;
 }

 def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32,
-                                    (add PRIVATE_RSRC_REG), Reg128> {
+                                    (add PRIVATE_RSRC_REG)> {
   let isAllocatable = 0;
   let CopyCost = -1;
 }

 def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
-                                     (add LDS_DIRECT), Reg32> {
+                                     (add LDS_DIRECT)> {
   let isAllocatable = 0;
   let CopyCost = -1;
 }
@@ -648,41 +434,40 @@ def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f1
     (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
      SGPR_NULL, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT,
      SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID,
-     SRC_VCCZ, SRC_EXECZ, SRC_SCC), Reg32> {
+     SRC_VCCZ, SRC_EXECZ, SRC_SCC)> {
   let AllocationPriority = 10;
 }

 def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
-                                     (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS), Reg32> {
+                                     (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> {
   let AllocationPriority = 10;
 }

 def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
-                                (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI), Reg32> {
+                                (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
   let AllocationPriority = 10;
 }

 // Register class for all scalar registers (SGPRs + Special Registers)
 def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
-  (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI), Reg32> {
+  (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> {
   let AllocationPriority = 10;
 }

 def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
-  (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI, LDS_DIRECT_CLASS),
-  Reg32> {
+  (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI, LDS_DIRECT_CLASS)> {
   let isAllocatable = 0;
 }

 def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32,
-                            (add SGPR_64Regs), Reg64> {
+                            (add SGPR_64Regs)> {
   let CopyCost = 1;
   let AllocationPriority = 11;
 }

 // CCR (call clobbered registers) SGPR 64-bit registers
 def CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
-                                (add (trunc SGPR_64, 16)), Reg64> {
+                                (add (trunc SGPR_64, 16))> {
   let CopyCost = SGPR_64.CopyCost;
   let AllocationPriority = SGPR_64.AllocationPriority;
 }
@@ -693,13 +478,13 @@ def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
 }

 def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
-                                  (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA), Reg64> {
+                                  (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> {
   let CopyCost = 1;
   let AllocationPriority = 13;
 }

 def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
-                            (add SReg_64_XEXEC, EXEC), Reg64> {
+                            (add SReg_64_XEXEC, EXEC)> {
   let CopyCost = 1;
   let AllocationPriority = 13;
 }
@@ -722,17 +507,17 @@ let CopyCost = 2 in {
 // There are no 3-component scalar instructions, but this is needed
 // for symmetry with VGPRs.
 def SGPR_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32,
-                            (add SGPR_96Regs), Reg96> {
+                            (add SGPR_96Regs)> {
   let AllocationPriority = 14;
 }

 def SReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32,
-                            (add SGPR_96), Reg96> {
+                            (add SGPR_96)> {
   let AllocationPriority = 14;
 }

 def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32,
-                             (add SGPR_128Regs), Reg128> {
+                             (add SGPR_128Regs)> {
   let AllocationPriority = 15;
 }

@@ -742,8 +527,9 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32,
 }

 def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
-                             (add SGPR_128, TTMP_128), Reg128> {
+                             (add SGPR_128, TTMP_128)> {
   let AllocationPriority = 15;
+  let isAllocatable = 0;
 }

 } // End CopyCost = 2
@@ -751,17 +537,16 @@ def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,

 // There are no 5-component scalar instructions, but this is needed
 // for symmetry with VGPRs.
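// Note (illustrative annotation, not from the patch): a worked reading of
// the SIRegisterTuples<Indices, RC, last_reg, stride, size, prefix> helper
// used above, e.g. SIRegisterTuples<getSubRegs<5>.ret, SGPR_32, 105, 4, 5, "s">:
// tuples of size = 5 consecutive SGPRs, one starting at every stride = 4
// registers, with s105 the last usable register, printed as "s[0:4]",
// "s[4:8]", and so on.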
 def SGPR_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
-                             (add SGPR_160Regs), Reg160> {
+                             (add SGPR_160Regs)> {
   let AllocationPriority = 16;
 }

 def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
-                             (add SGPR_160), Reg160> {
+                             (add SGPR_160)> {
   let AllocationPriority = 16;
 }

-def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs),
-                             Reg256> {
+def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs)> {
   let AllocationPriority = 17;
 }

@@ -770,14 +555,14 @@ def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> {
 }

 def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
-                             (add SGPR_256, TTMP_256), Reg256> {
+                             (add SGPR_256, TTMP_256)> {
   // Requires 4 s_mov_b64 to copy
   let CopyCost = 4;
   let AllocationPriority = 17;
 }

 def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
-                             (add SGPR_512Regs), Reg512> {
+                             (add SGPR_512Regs)> {
   let AllocationPriority = 18;
 }

@@ -787,31 +572,31 @@ def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
 }

 def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
-                             (add SGPR_512, TTMP_512), Reg512> {
+                             (add SGPR_512, TTMP_512)> {
   // Requires 8 s_mov_b64 to copy
   let CopyCost = 8;
   let AllocationPriority = 18;
 }

 def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
-                                 (add VGPR_32, LDS_DIRECT_CLASS), Reg32> {
+                                 (add VGPR_32, LDS_DIRECT_CLASS)> {
   let isAllocatable = 0;
 }

 def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
-                              (add SGPR_1024Regs), Reg1024> {
+                              (add SGPR_1024Regs)> {
   let AllocationPriority = 19;
 }

 def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
-                              (add SGPR_1024), Reg1024> {
+                              (add SGPR_1024)> {
   let CopyCost = 16;
   let AllocationPriority = 19;
 }

 // Register class for all vector registers (VGPRs + Interpolation Registers)
-def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32,
-                            (add VGPR_64), Reg64> {
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4], 32,
+                            (add VGPR_64)> {
   let Size = 64;

   // Requires 2 v_mov_b32 to copy
@@ -819,7 +604,7 @@ def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32
   let AllocationPriority = 2;
 }

-def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96), Reg96> {
+def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96)> {
   let Size = 96;

   // Requires 3 v_mov_b32 to copy
@@ -828,7 +613,7 @@ def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96), Reg96>
 }

 def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
-                             (add VGPR_128), Reg128> {
+                             (add VGPR_128)> {
   let Size = 128;

   // Requires 4 v_mov_b32 to copy
@@ -837,7 +622,7 @@ def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
 }

 def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
-                             (add VGPR_160), Reg160> {
+                             (add VGPR_160)> {
   let Size = 160;

   // Requires 5 v_mov_b32 to copy
@@ -846,28 +631,28 @@ def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
 }

 def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
-                             (add VGPR_256), Reg256> {
+                             (add VGPR_256)> {
   let Size = 256;
   let CopyCost = 8;
   let AllocationPriority = 6;
 }

 def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
-                             (add VGPR_512), Reg512> {
+                             (add VGPR_512)> {
   let Size = 512;
   let CopyCost = 16;
   let AllocationPriority = 7;
 }

 def VReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
-                              (add VGPR_1024), Reg1024> {
+                              (add VGPR_1024)> {
   let Size = 1024;
   let CopyCost = 32;
   let AllocationPriority = 8;
 }

 def AReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32,
-                            (add AGPR_64), Reg64> {
+                            (add AGPR_64)> {
   let Size = 64;

   let CopyCost = 5;
@@ -875,7 +660,7 @@ def AReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32
 }

 def AReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
-                             (add AGPR_128), Reg128> {
+                             (add AGPR_128)> {
   let Size = 128;

   // Requires 4 v_accvgpr_write and 4 v_accvgpr_read to copy + burn 1 vgpr
@@ -884,40 +669,39 @@ def AReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
 }

 def AReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
-                             (add AGPR_512), Reg512> {
+                             (add AGPR_512)> {
   let Size = 512;
   let CopyCost = 33;
   let AllocationPriority = 7;
 }

 def AReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
-                              (add AGPR_1024), Reg1024> {
+                              (add AGPR_1024)> {
   let Size = 1024;
   let CopyCost = 65;
   let AllocationPriority = 8;
 }

-def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32), Reg32> {
-  let Size = 32;
+def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
+  let Size = 1;
 }

 def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
-                          (add VGPR_32, SReg_32, LDS_DIRECT_CLASS), Reg32> {
+                          (add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> {
   let isAllocatable = 0;
 }

-def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64),
-            Reg64> {
+def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> {
   let isAllocatable = 0;
 }

 def AV_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
-                          (add AGPR_32, VGPR_32), Reg32> {
+                          (add AGPR_32, VGPR_32)> {
   let isAllocatable = 0;
 }

 def AV_64 : RegisterClass<"AMDGPU", [i64, f64, v4f16], 32,
-                          (add AReg_64, VReg_64), Reg64> {
+                          (add AReg_64, VReg_64)> {
   let isAllocatable = 0;
 }
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 7ee178149c7a..8afca2cdc325 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -77,8 +77,8 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
   // Try to fold Src0
   MachineOperand &Src0 = MI.getOperand(Src0Idx);
   if (Src0.isReg()) {
-    unsigned Reg = Src0.getReg();
-    if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
+    Register Reg = Src0.getReg();
+    if (Register::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
       MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
       if (Def && Def->isMoveImmediate()) {
         MachineOperand &MovSrc = Def->getOperand(1);
@@ -360,8 +360,7 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
   }

   if (NewImm != 0) {
-    if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
-        SrcReg->isReg()) {
+    if (Register::isVirtualRegister(Dest->getReg()) && SrcReg->isReg()) {
       MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
       MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
       return true;
@@ -394,12 +393,11 @@ static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
     if (!MO.isReg())
       continue;

-    if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
-        TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+    if (Register::isPhysicalRegister(Reg) &&
+        Register::isPhysicalRegister(MO.getReg())) {
       if (TRI.regsOverlap(Reg, MO.getReg()))
         return true;
-    } else if (MO.getReg() == Reg &&
-               TargetRegisterInfo::isVirtualRegister(Reg)) {
+    } else if (MO.getReg() == Reg && Register::isVirtualRegister(Reg)) {
       LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) &
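      // Note (illustrative annotation, not from the patch): llvm::Register
      // converts implicitly to and from unsigned, so the Register-for-unsigned
      // substitutions running through this file are behavior-preserving; the
      // static Register::isVirtualRegister / Register::isPhysicalRegister
      // helpers replace the TargetRegisterInfo equivalents used previously.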
                             TRI.getSubRegIndexLaneMask(MO.getSubReg());
       if (Overlap.any())
@@ -425,7 +423,7 @@ static TargetInstrInfo::RegSubRegPair
 getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I,
                   const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) {
   if (TRI.getRegSizeInBits(Reg, MRI) != 32) {
-    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+    if (Register::isPhysicalRegister(Reg)) {
       Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
     } else {
       LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub);
@@ -459,13 +457,13 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
   assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
          MovT.getOpcode() == AMDGPU::COPY);

-  unsigned T = MovT.getOperand(0).getReg();
+  Register T = MovT.getOperand(0).getReg();
   unsigned Tsub = MovT.getOperand(0).getSubReg();

   MachineOperand &Xop = MovT.getOperand(1);

   if (!Xop.isReg())
     return nullptr;
-  unsigned X = Xop.getReg();
+  Register X = Xop.getReg();
   unsigned Xsub = Xop.getSubReg();

   unsigned Size = TII->getOpSize(MovT, 0) / 4;
@@ -484,7 +482,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
         MovY.getOperand(1).getSubReg() != Tsub)
       continue;

-    unsigned Y = MovY.getOperand(0).getReg();
+    Register Y = MovY.getOperand(0).getReg();
     unsigned Ysub = MovY.getOperand(0).getSubReg();

     if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent())
@@ -579,7 +577,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
         // XXX - not exactly a check for post-regalloc run.
         MachineOperand &Src = MI.getOperand(1);
         if (Src.isImm() &&
-            TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
+            Register::isPhysicalRegister(MI.getOperand(0).getReg())) {
           int32_t ReverseImm;
           if (isReverseInlineImm(TII, Src, ReverseImm)) {
             MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
@@ -643,8 +641,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
         // FIXME: This could work better if hints worked with subregisters. If
         // we have a vector add of a constant, we usually don't get the correct
         // allocation due to the subregister usage.
-        if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
-            Src0->isReg()) {
+        if (Register::isVirtualRegister(Dest->getReg()) && Src0->isReg()) {
           MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
           MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
           continue;
@@ -672,8 +669,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
         const MachineOperand &Dst = MI.getOperand(0);
         MachineOperand &Src = MI.getOperand(1);

-        if (Src.isImm() &&
-            TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) {
+        if (Src.isImm() && Register::isPhysicalRegister(Dst.getReg())) {
           int32_t ReverseImm;
           if (isKImmOperand(TII, Src))
             MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
@@ -721,8 +717,8 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {

       int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
       if (TII->isVOPC(Op32)) {
-        unsigned DstReg = MI.getOperand(0).getReg();
-        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
+        Register DstReg = MI.getOperand(0).getReg();
+        if (Register::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because this is only one register and
          // cannot deal with sequences which would require multiple copies of
@@ -745,8 +741,8 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
         if (!Src2->isReg())
           continue;
-        unsigned SReg = Src2->getReg();
-        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
+        Register SReg = Src2->getReg();
+        if (Register::isVirtualRegister(SReg)) {
           MRI.setRegAllocationHint(SReg, 0, VCCReg);
           continue;
         }
@@ -766,7 +762,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {

         bool Next = false;
         if (SDst->getReg() != VCCReg) {
-          if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
+          if (Register::isVirtualRegister(SDst->getReg()))
             MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg);
           Next = true;
         }
@@ -774,7 +770,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
         // All of the instructions with carry outs also have an SGPR input in
         // src2.
         if (Src2 && Src2->getReg() != VCCReg) {
-          if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
+          if (Register::isVirtualRegister(Src2->getReg()))
             MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg);
           Next = true;
         }
diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 4e07efff55d8..cb4cf68d709a 100644
--- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -273,12 +273,12 @@ void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
     if (!Use.isReg() || !Use.isUse())
       continue;

-    unsigned Reg = Use.getReg();
+    Register Reg = Use.getReg();

     // Handle physical registers that we need to track; this is mostly relevant
     // for VCC, which can appear as the (implicit) input of a uniform branch,
     // e.g. when a loop counter is stored in a VGPR.
-    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+    if (!Register::isVirtualRegister(Reg)) {
       if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
         continue;

@@ -312,6 +312,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
   char GlobalFlags = 0;
   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
   SmallVector<MachineInstr *, 4> SetInactiveInstrs;
+  SmallVector<MachineInstr *, 4> SoftWQMInstrs;

   // We need to visit the basic blocks in reverse post-order so that we visit
   // defs before uses, in particular so that we don't accidentally mark an
@@ -340,6 +341,10 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         // correct, so we need it to be in WQM.
         Flags = StateWQM;
         LowerToCopyInstrs.push_back(&MI);
+      } else if (Opcode == AMDGPU::SOFT_WQM) {
+        LowerToCopyInstrs.push_back(&MI);
+        SoftWQMInstrs.push_back(&MI);
+        continue;
       } else if (Opcode == AMDGPU::WWM) {
         // The WWM intrinsic doesn't make the same guarantee, and plus it needs
         // to be executed in WQM or Exact so that its copy doesn't clobber
@@ -356,8 +361,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         if (Inactive.isUndef()) {
           LowerToCopyInstrs.push_back(&MI);
         } else {
-          unsigned Reg = Inactive.getReg();
-          if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+          Register Reg = Inactive.getReg();
+          if (Register::isVirtualRegister(Reg)) {
             for (MachineInstr &DefMI : MRI->def_instructions(Reg))
               markInstruction(DefMI, StateWWM, Worklist);
           }
@@ -385,9 +390,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
           if (!MO.isReg())
             continue;

-          unsigned Reg = MO.getReg();
+          Register Reg = MO.getReg();

-          if (!TRI->isVirtualRegister(Reg) &&
+          if (!Register::isVirtualRegister(Reg) &&
               TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
             Flags = StateWQM;
             break;
@@ -407,9 +412,12 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
   // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
   // ever used anywhere in the function. This implements the corresponding
   // semantics of @llvm.amdgcn.set.inactive.
+  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
   if (GlobalFlags & StateWQM) {
     for (MachineInstr *MI : SetInactiveInstrs)
       markInstruction(*MI, StateWQM, Worklist);
+    for (MachineInstr *MI : SoftWQMInstrs)
+      markInstruction(*MI, StateWQM, Worklist);
   }

   return GlobalFlags;
@@ -548,7 +556,7 @@ bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
 MachineBasicBlock::iterator
 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator Before) {
-  unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

   MachineInstr *Save =
       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
@@ -832,7 +840,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
 void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
   for (MachineInstr *MI : LiveMaskQueries) {
     const DebugLoc &DL = MI->getDebugLoc();
-    unsigned Dest = MI->getOperand(0).getReg();
+    Register Dest = MI->getOperand(0).getReg();
     MachineInstr *Copy =
         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
             .addReg(LiveMaskReg);
@@ -847,13 +855,12 @@ void SIWholeQuadMode::lowerCopyInstrs() {
     for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
       MI->RemoveOperand(i);

-    const unsigned Reg = MI->getOperand(0).getReg();
+    const Register Reg = MI->getOperand(0).getReg();

     if (TRI->isVGPR(*MRI, Reg)) {
-      const TargetRegisterClass *regClass =
-          TargetRegisterInfo::isVirtualRegister(Reg)
-              ? MRI->getRegClass(Reg)
-              : TRI->getPhysRegClass(Reg);
+      const TargetRegisterClass *regClass = Register::isVirtualRegister(Reg)
+                                                ? MRI->getRegClass(Reg)
+                                                : TRI->getPhysRegClass(Reg);

       const unsigned MovOp = TII->getMovOpcode(regClass);
       MI->setDesc(TII->get(MovOp));
@@ -885,7 +892,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
   if (!(GlobalFlags & StateWQM)) {
     lowerLiveMaskQueries(Exec);
-    if (!(GlobalFlags & StateWWM))
+    if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty())
       return !LiveMaskQueries.empty();
   } else {
     // Store a copy of the original live mask when required
diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td
index 1b410b6b5912..1a74ebbf8165 100644
--- a/lib/Target/AMDGPU/SMInstructions.td
+++ b/lib/Target/AMDGPU/SMInstructions.td
@@ -793,9 +793,18 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt> {
 // selector to prefer those.
 let AddedComplexity = 100 in {

-defm : SMRD_Pattern <"S_LOAD_DWORD", i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>;
+foreach vt = Reg32Types.types in {
+defm : SMRD_Pattern <"S_LOAD_DWORD", vt>;
+}
+
+foreach vt = SReg_64.RegTypes in {
+defm : SMRD_Pattern <"S_LOAD_DWORDX2", vt>;
+}
+
+foreach vt = SReg_128.RegTypes in {
+defm : SMRD_Pattern <"S_LOAD_DWORDX4", vt>;
+}
+
 defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
 defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;

diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index dfafdccc05a3..d31a49f428ee 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -181,7 +181,9 @@ def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;
 def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
   [(set i32:$sdst, (ctpop i32:$src0))]
 >;
-def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64">;
+def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64",
+  [(set i32:$sdst, (ctpop i64:$src0))]
+>;
 } // End Defs = [SCC]

 def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">;
@@ -417,16 +419,16 @@ def S_SUBB_U32 : SOP2_32 <"s_subb_u32",
 let isCommutable = 1 in {
 def S_MIN_I32 : SOP2_32 <"s_min_i32",
-  [(set i32:$sdst, (UniformBinFrag<smin> i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (smin i32:$src0, i32:$src1))]
 >;
 def S_MIN_U32 : SOP2_32 <"s_min_u32",
-  [(set i32:$sdst, (UniformBinFrag<umin> i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (umin i32:$src0, i32:$src1))]
 >;
 def S_MAX_I32 : SOP2_32 <"s_max_i32",
-  [(set i32:$sdst, (UniformBinFrag<smax> i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (smax i32:$src0, i32:$src1))]
 >;
 def S_MAX_U32 : SOP2_32 <"s_max_u32",
-  [(set i32:$sdst, (UniformBinFrag<umax> i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (umax i32:$src0, i32:$src1))]
 >;
 } // End isCommutable = 1
 } // End Defs = [SCC]
@@ -853,13 +855,13 @@ class SOPC_Base <bits<7> op, RegisterOperand rc0, RegisterOperand rc1,
   let Defs = [SCC];
 }
 class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt,
-                   string opName, PatLeaf cond> : SOPC_Base <
+                   string opName, SDPatternOperator cond> : SOPC_Base <
   op, rc, rc, opName,
   [(set SCC, (si_setcc_uniform vt:$src0, vt:$src1, cond))] > {
 }

 class SOPC_CMP_32<bits<7> op, string opName,
-                  PatLeaf cond = COND_NULL, string revOp = opName>
+                  SDPatternOperator cond = COND_NULL, string revOp = opName>
   : SOPC_Helper<op, SSrc_b32, i32, opName, cond>,
     Commutable_REV<revOp, !eq(revOp, opName)>,
     SOPKInstTable<0, opName> {
@@ -868,7 +870,7 @@ class SOPC_CMP_32<bits<7> op, string opName,
 }

 class SOPC_CMP_64<bits<7> op, string opName,
-                  PatLeaf cond = COND_NULL, string revOp = opName>
+                  SDPatternOperator cond = COND_NULL, string revOp = opName>
   : SOPC_Helper<op, SSrc_b64, i64, opName, cond>,
     Commutable_REV<revOp, !eq(revOp, opName)> {
   let isCompare = 1;
@@ -1076,8 +1078,6 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
   [(int_amdgcn_s_barrier)]> {
   let SchedRW = [WriteBarrier];
   let simm16 = 0;
-  let mayLoad = 1;
-  let mayStore = 1;
   let isConvergent = 1;
 }

@@ -1090,7 +1090,7 @@ def S_WAKEUP : SOPP <0x00000003, (ins), "s_wakeup"> {
 let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
 def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16",
-    [(int_amdgcn_s_waitcnt UIMM16bit:$simm16)]>;
+    [(int_amdgcn_s_waitcnt timm:$simm16)]>;
 def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
 def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">;

@@ -1099,7 +1099,7 @@ def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">;
 // maximum reported is 960 cycles, so 960 / 64 = 15 max, so is the
 // maximum really 15 on VI?
 def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16),
-  "s_sleep $simm16", [(int_amdgcn_s_sleep SIMM16bit:$simm16)]> {
+  "s_sleep $simm16", [(int_amdgcn_s_sleep timm:$simm16)]> {
   let hasSideEffects = 1;
   let mayLoad = 1;
   let mayStore = 1;
@@ -1110,12 +1110,11 @@ def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">;
 let Uses = [EXEC, M0] in {
 // FIXME: Should this be mayLoad+mayStore?
 def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16",
-  [(AMDGPUsendmsg (i32 imm:$simm16))]
->;
+  [(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]>;

 def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16",
-  [(AMDGPUsendmsghalt (i32 imm:$simm16))]
->;
+  [(int_amdgcn_s_sendmsghalt (i32 timm:$simm16), M0)]>;
+
 } // End Uses = [EXEC, M0]

 def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16"> {
@@ -1126,13 +1125,13 @@ def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> {
   let simm16 = 0;
 }
 def S_INCPERFLEVEL : SOPP <0x00000014, (ins i32imm:$simm16), "s_incperflevel $simm16",
-  [(int_amdgcn_s_incperflevel SIMM16bit:$simm16)]> {
+  [(int_amdgcn_s_incperflevel timm:$simm16)]> {
   let hasSideEffects = 1;
   let mayLoad = 1;
   let mayStore = 1;
 }
 def S_DECPERFLEVEL : SOPP <0x00000015, (ins i32imm:$simm16), "s_decperflevel $simm16",
-  [(int_amdgcn_s_decperflevel SIMM16bit:$simm16)]> {
+  [(int_amdgcn_s_decperflevel timm:$simm16)]> {
   let hasSideEffects = 1;
   let mayLoad = 1;
   let mayStore = 1;
@@ -1169,7 +1168,10 @@ let SubtargetPredicate = isGFX10Plus in {
   def S_ROUND_MODE :
     SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">;
   def S_DENORM_MODE :
-    SOPP<0x025, (ins s16imm:$simm16), "s_denorm_mode $simm16">;
+    SOPP<0x025, (ins i32imm:$simm16), "s_denorm_mode $simm16",
+      [(SIdenorm_mode (i32 timm:$simm16))]> {
+      let hasSideEffects = 1;
+    }
   def S_TTRACEDATA_IMM :
     SOPP<0x028, (ins s16imm:$simm16), "s_ttracedata_imm $simm16">;
 } // End SubtargetPredicate = isGFX10Plus
@@ -1178,7 +1180,7 @@ let SubtargetPredicate = isGFX10Plus in {
 // S_GETREG_B32 Intrinsic Pattern.
 //===----------------------------------------------------------------------===//
 def : GCNPat <
-  (int_amdgcn_s_getreg imm:$simm16),
+  (int_amdgcn_s_getreg timm:$simm16),
   (S_GETREG_B32 (as_i16imm $simm16))
 >;

diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index e90f40e6abea..afb2fd987afd 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -131,29 +131,70 @@ int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) {
 struct MUBUFInfo {
   uint16_t Opcode;
   uint16_t BaseOpcode;
-  uint8_t dwords;
+  uint8_t elements;
   bool has_vaddr;
   bool has_srsrc;
   bool has_soffset;
 };

+struct MTBUFInfo {
+  uint16_t Opcode;
+  uint16_t BaseOpcode;
+  uint8_t elements;
+  bool has_vaddr;
+  bool has_srsrc;
+  bool has_soffset;
+};
+
+#define GET_MTBUFInfoTable_DECL
+#define GET_MTBUFInfoTable_IMPL
 #define GET_MUBUFInfoTable_DECL
 #define GET_MUBUFInfoTable_IMPL
 #include "AMDGPUGenSearchableTables.inc"

+int getMTBUFBaseOpcode(unsigned Opc) {
+  const MTBUFInfo *Info = getMTBUFInfoFromOpcode(Opc);
+  return Info ? Info->BaseOpcode : -1;
+}
+
+int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements) {
+  const MTBUFInfo *Info = getMTBUFInfoFromBaseOpcodeAndElements(BaseOpc, Elements);
+  return Info ? Info->Opcode : -1;
+}
+
+int getMTBUFElements(unsigned Opc) {
+  const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc);
+  return Info ? Info->elements : 0;
+}
+
+bool getMTBUFHasVAddr(unsigned Opc) {
+  const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc);
+  return Info ? Info->has_vaddr : false;
+}
+
+bool getMTBUFHasSrsrc(unsigned Opc) {
+  const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc);
+  return Info ? Info->has_srsrc : false;
+}
+
+bool getMTBUFHasSoffset(unsigned Opc) {
+  const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc);
+  return Info ? Info->has_soffset : false;
+}
+
 int getMUBUFBaseOpcode(unsigned Opc) {
   const MUBUFInfo *Info = getMUBUFInfoFromOpcode(Opc);
   return Info ? Info->BaseOpcode : -1;
 }

-int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords) {
-  const MUBUFInfo *Info = getMUBUFInfoFromBaseOpcodeAndDwords(BaseOpc, Dwords);
+int getMUBUFOpcode(unsigned BaseOpc, unsigned Elements) {
+  const MUBUFInfo *Info = getMUBUFInfoFromBaseOpcodeAndElements(BaseOpc, Elements);
   return Info ? Info->Opcode : -1;
 }

-int getMUBUFDwords(unsigned Opc) {
+int getMUBUFElements(unsigned Opc) {
   const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
-  return Info ? Info->dwords : 0;
+  return Info ? Info->elements : 0;
 }

 bool getMUBUFHasVAddr(unsigned Opc) {
@@ -241,7 +282,7 @@ unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
 }

 unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI) {
-  return getMaxWavesPerEU() * getEUsPerCU(STI);
+  return getMaxWavesPerEU(STI) * getEUsPerCU(STI);
 }

 unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI,
@@ -253,9 +294,11 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {
   return 1;
 }

-unsigned getMaxWavesPerEU() {
+unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) {
   // FIXME: Need to take scratch memory into account.
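  // Note (illustrative annotation, not from the patch): the new subtarget
  // parameter matters because the limit stops being a constant here; as the
  // changed body below shows, it stays 10 waves per EU for pre-GFX10
  // targets and becomes 20 on GFX10.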
-  return 10;
+  if (!isGFX10(*STI))
+    return 10;
+  return 20;
 }

 unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI,
@@ -317,7 +360,7 @@ unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
   if (Version.Major >= 10)
     return 0;

-  if (WavesPerEU >= getMaxWavesPerEU())
+  if (WavesPerEU >= getMaxWavesPerEU(STI))
     return 0;

   unsigned MinNumSGPRs = getTotalNumSGPRs(STI) / (WavesPerEU + 1);
@@ -394,17 +437,19 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
 }

 unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
-  return 256;
+  if (!isGFX10(*STI))
+    return 256;
+  return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1024 : 512;
 }

 unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
-  return getTotalNumVGPRs(STI);
+  return 256;
 }

 unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
   assert(WavesPerEU != 0);

-  if (WavesPerEU >= getMaxWavesPerEU())
+  if (WavesPerEU >= getMaxWavesPerEU(STI))
     return 0;

   unsigned MinNumVGPRs = alignDown(getTotalNumVGPRs(STI) / (WavesPerEU + 1),
@@ -510,7 +555,7 @@ bool isReadOnlySegment(const GlobalValue *GV) {
 }

 bool shouldEmitConstantsToTextSection(const Triple &TT) {
-  return TT.getOS() != Triple::AMDHSA;
+  return TT.getOS() == Triple::AMDPAL;
 }

 int getIntegerAttribute(const Function &F, StringRef Name, int Default) {
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 209ef7eef749..f78dadd447ff 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -94,7 +94,7 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI);

 /// \returns Maximum number of waves per execution unit for given subtarget \p
 /// STI without any kind of limitation.
-unsigned getMaxWavesPerEU();
+unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI);

 /// \returns Maximum number of waves per execution unit for given subtarget \p
 /// STI and limited by given \p FlatWorkGroupSize.
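Since getMaxWavesPerEU() now takes the subtarget, callers have to thread an
MCSubtargetInfo handle through wave-occupancy calculations. Below is a minimal
caller-side sketch of that pattern, assuming only the query functions declared
in this header; clampWavesPerEU is a hypothetical helper for illustration, not
part of the patch:

#include <algorithm>

#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"

// Hypothetical helper (not from the patch): clamp a requested waves-per-EU
// count to the subtarget's limits. With this change, getMaxWavesPerEU(STI)
// yields 20 on GFX10 and 10 on earlier targets.
static unsigned clampWavesPerEU(const llvm::MCSubtargetInfo *STI,
                                unsigned Requested) {
  unsigned Min = llvm::AMDGPU::getMinWavesPerEU(STI); // currently always 1
  unsigned Max = llvm::AMDGPU::getMaxWavesPerEU(STI);
  return std::min(std::max(Requested, Min), Max);
}

The VGPR queries changed in the same file (getTotalNumVGPRs, getMinNumVGPRs)
follow the same subtarget-driven convention.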
@@ -264,13 +264,31 @@ LLVM_READONLY
 const MIMGInfo *getMIMGInfo(unsigned Opc);

 LLVM_READONLY
+int getMTBUFBaseOpcode(unsigned Opc);
+
+LLVM_READONLY
+int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements);
+
+LLVM_READONLY
+int getMTBUFElements(unsigned Opc);
+
+LLVM_READONLY
+bool getMTBUFHasVAddr(unsigned Opc);
+
+LLVM_READONLY
+bool getMTBUFHasSrsrc(unsigned Opc);
+
+LLVM_READONLY
+bool getMTBUFHasSoffset(unsigned Opc);
+
+LLVM_READONLY
 int getMUBUFBaseOpcode(unsigned Opc);

 LLVM_READONLY
-int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords);
+int getMUBUFOpcode(unsigned BaseOpc, unsigned Elements);

 LLVM_READONLY
-int getMUBUFDwords(unsigned Opc);
+int getMUBUFElements(unsigned Opc);

 LLVM_READONLY
 bool getMUBUFHasVAddr(unsigned Opc);

diff --git a/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index db20d5ccf5f9..207e4232e829 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -21,6 +21,8 @@
 #include "SIDefines.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
 #include "llvm/Support/AMDGPUMetadata.h"
 #include "llvm/Support/EndianStream.h"

diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index 6bc416ed7d4b..f1cdc3097dc0 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -104,9 +104,21 @@ multiclass VOP1Inst <string opName, VOPProfile P,
                      SDPatternOperator node = null_frag> {
   def _e32 : VOP1_Pseudo <opName, P>;
   def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
-  def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
+
+  foreach _ = BoolToList<P.HasExtSDWA>.ret in
+    def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
+
   foreach _ = BoolToList<P.HasExtDPP>.ret in
     def _dpp : VOP1_DPP_Pseudo <opName, P>;
+
+  def : MnemonicAlias<opName#"_e32", opName>, LetDummies;
+  def : MnemonicAlias<opName#"_e64", opName>, LetDummies;
+
+  foreach _ = BoolToList<P.HasExtSDWA>.ret in
+    def : MnemonicAlias<opName#"_sdwa", opName>, LetDummies;
+
+  foreach _ = BoolToList<P.HasExtDPP>.ret in
+    def : MnemonicAlias<opName#"_dpp", opName>, LetDummies;
 }

 // Special profile for instructions which have clamp
@@ -227,10 +239,10 @@ defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;
 } // End SchedRW = [WriteQuarterRate32]

 defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
-defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32>;
-defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32>;
+defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, bitreverse>;
+defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32, AMDGPUffbh_u32>;
 defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>;
-defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>;
+defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>;

 let SchedRW = [WriteDoubleAdd] in {
 defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
@@ -434,7 +446,7 @@ let SubtargetPredicate = isGFX10Plus in {
 // Target-specific instruction encodings.
 //===----------------------------------------------------------------------===//

-class VOP1_DPP<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 = 0> :
+class VOP1_DPP<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 = 0> :
     VOP_DPP<ps.OpName, p, isDPP16> {
   let hasSideEffects = ps.hasSideEffects;
   let Defs = ps.Defs;
@@ -448,8 +460,9 @@ class VOP1_DPP<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 =
   let Inst{31-25} = 0x3f;
 }

-class VOP1_DPP16<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
-    VOP1_DPP<op, ps, p, 1> {
+class VOP1_DPP16<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl> :
+    VOP1_DPP<op, ps, p, 1>,
+    SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX10> {
   let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst);
   let SubtargetPredicate = HasDPP16;
 }
@@ -492,6 +505,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
         VOP3e_gfx10<{0, 1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
   }
   multiclass VOP1_Real_sdwa_gfx10<bits<9> op> {
+    foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
     def _sdwa_gfx10 :
       VOP_SDWA10_Real<!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
       VOP1_SDWA9Ae<op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
@@ -499,11 +513,13 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
     }
   }
   multiclass VOP1_Real_dpp_gfx10<bits<9> op> {
-    def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> {
+    foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")> {
       let DecoderNamespace = "SDWA10";
     }
   }
   multiclass VOP1_Real_dpp8_gfx10<bits<9> op> {
+    foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
     def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> {
       let DecoderNamespace = "DPP8";
     }
@@ -704,10 +720,12 @@ multiclass VOP1_Real_e32e64_vi <bits<10> op> {

 multiclass VOP1_Real_vi <bits<10> op> {
   defm NAME : VOP1_Real_e32e64_vi <op>;

+  foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA>.ret in
   def _sdwa_vi :
     VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;

+  foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
   def _sdwa_gfx9 :
     VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;

@@ -831,25 +849,25 @@ def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>;
 def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>;
 def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>;

-let OtherPredicates = [isGFX8GFX9] in {
+let OtherPredicates = [isGFX8Plus] in {

 def : GCNPat <
-  (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
-                           imm:$bound_ctrl)),
+  (i32 (int_amdgcn_mov_dpp i32:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask,
+                           timm:$bound_ctrl)),
   (V_MOV_B32_dpp $src, $src, (as_i32imm $dpp_ctrl),
                        (as_i32imm $row_mask), (as_i32imm $bank_mask),
                        (as_i1imm $bound_ctrl))
 >;

 def : GCNPat <
-  (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, imm:$dpp_ctrl, imm:$row_mask,
-                              imm:$bank_mask, imm:$bound_ctrl)),
+  (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, timm:$dpp_ctrl, timm:$row_mask,
+                              timm:$bank_mask, timm:$bound_ctrl)),
   (V_MOV_B32_dpp $old, $src, (as_i32imm $dpp_ctrl),
                        (as_i32imm $row_mask), (as_i32imm $bank_mask),
                        (as_i1imm $bound_ctrl))
 >;

-} // End OtherPredicates = [isGFX8GFX9]
+} // End OtherPredicates = [isGFX8Plus]

 let OtherPredicates = [isGFX8Plus] in {
 def : GCNPat<
@@ -885,6 +903,7 @@ multiclass VOP1_Real_gfx9 <bits<10> op> {
     defm NAME : VOP1_Real_e32e64_vi <op>;
   }

+  foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
   def _sdwa_gfx9 :
     VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;

@@ -904,23 +923,7 @@ defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;

 let OtherPredicates = [isGFX10Plus] in {
 def : GCNPat <
-  (i32 (int_amdgcn_mov_dpp8 i32:$src, imm:$dpp8)),
+  (i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)),
   (V_MOV_B32_dpp8_gfx10 $src, $src, (as_i32imm $dpp8),
                         (i32 DPP8Mode.FI_0))
 >;
-
-def : GCNPat <
-  (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
-                           imm:$bound_ctrl)),
-  (V_MOV_B32_dpp_gfx10 $src, $src, (as_i32imm $dpp_ctrl),
-                       (as_i32imm $row_mask), (as_i32imm $bank_mask),
-                       (as_i1imm $bound_ctrl), (i32 0))
->;
-
-def : GCNPat <
-  (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, imm:$dpp_ctrl, imm:$row_mask,
-                              imm:$bank_mask, imm:$bound_ctrl)),
-  (V_MOV_B32_dpp_gfx10 $old, $src, (as_i32imm $dpp_ctrl),
-                       (as_i32imm $row_mask), (as_i32imm $bank_mask),
-                       (as_i1imm $bound_ctrl), (i32 0))
->;
 } // End OtherPredicates = [isGFX10Plus]

diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index 1b30cd2ed516..1ab0fc1ab58d 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -147,7 +147,8 @@ multiclass VOP2Inst_sdwa<string opName,
                          string revOp = opName,
                          bit GFX9Renamed = 0> {
   let renamedInGFX9 = GFX9Renamed in {
-    def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
+    foreach _ = BoolToList<P.HasExtSDWA>.ret in
+      def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
   } // End renamedInGFX9 = GFX9Renamed
 }

@@ -179,9 +180,10 @@ multiclass VOP2bInst <string opName,
         let usesCustomInserter = !eq(P.NumSrcArgs, 2);
       }

-      def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
-        let AsmMatchConverter = "cvtSdwaVOP2b";
-      }
+      foreach _ = BoolToList<P.HasExtSDWA>.ret in
+        def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
+          let AsmMatchConverter = "cvtSdwaVOP2b";
+        }

       foreach _ = BoolToList<P.HasExtDPP>.ret in
         def _dpp : VOP2_DPP_Pseudo <opName, P>;
 }
@@ -220,9 +222,10 @@ multiclass VOP2eInst <string opName,
       def _e32 : VOP2_Pseudo <opName, P>,
                  Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;

-      def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
-        let AsmMatchConverter = "cvtSdwaVOP2b";
-      }
+      foreach _ = BoolToList<P.HasExtSDWA>.ret in
+        def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
+          let AsmMatchConverter = "cvtSdwaVOP2e";
+        }

       foreach _ = BoolToList<P.HasExtDPP>.ret in
         def _dpp : VOP2_DPP_Pseudo <opName, P>;

@@ -251,7 +254,9 @@ multiclass VOP2eInstAliases<VOP2_Pseudo ps, VOP2_Real inst> {
 class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
   field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
-  field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm);
+  field dag Ins32 = !if(!eq(vt.Size, 32),
+                        (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm),
+                        (ins VCSrc_f16:$src0, VGPR_32:$src1, ImmOpType:$imm));
   field bit HasExt = 0;

   // Hack to stop printing _e64
@@ -519,7 +524,7 @@ def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
 } // End isConvergent = 1

 defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
-defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
+defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, add_ctpop>;
 defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_lo>;
 defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
 defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
@@ -539,9 +544,9 @@ defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfma
 let SubtargetPredicate = isGFX6GFX7GFX10 in {
 let isCommutable = 1 in {
 defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
-defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>;
-defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>;
-defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
+defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32, srl>;
+defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32, sra>;
+defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32, shl>;
 } // End isCommutable = 1
 } // End SubtargetPredicate = isGFX6GFX7GFX10
@@ -606,9 +611,9 @@ def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
 defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
 } // End FPDPRounding = 1

-defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
-defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>;
-defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>;
+defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16, lshl_rev>;
+defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16, lshr_rev>;
+defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16, ashr_rev>;

 let isCommutable = 1 in {
 let FPDPRounding = 1 in {
@@ -618,16 +623,16 @@ defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub
 defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
 def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
 } // End FPDPRounding = 1
-defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
-defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
+defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16, add>;
+defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16, sub>;
 defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
-defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>;
+defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16, mul>;
 defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>;
 defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>;
-defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>;
-defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>;
-defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>;
-defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16>;
+defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16, umax>;
+defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16, smax>;
+defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16, umin>;
+defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16, smin>;

 let Constraints = "$vdst = $src2", DisableEncoding="$src2",
     isConvertibleToThreeAddress = 1 in {
@@ -653,16 +658,17 @@ defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>;

 let Constraints = "$vdst = $src2", DisableEncoding="$src2",
     isConvertibleToThreeAddress = 1,
-    isCommutable = 1 in {
+    isCommutable = 1,
+    IsDOT = 1 in {
   let SubtargetPredicate = HasDot5Insts in
-    defm V_DOT2C_F32_F16 : VOP2Inst_e32<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16>;
+    defm V_DOT2C_F32_F16 : VOP2Inst<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16>;
   let SubtargetPredicate = HasDot6Insts in
-    defm V_DOT4C_I32_I8 : VOP2Inst_e32<"v_dot4c_i32_i8", VOP_DOT_ACC_I32_I32>;
+    defm V_DOT4C_I32_I8 : VOP2Inst<"v_dot4c_i32_i8", VOP_DOT_ACC_I32_I32>;
   let SubtargetPredicate = HasDot4Insts in
-    defm V_DOT2C_I32_I16 : VOP2Inst_e32<"v_dot2c_i32_i16", VOP_DOT_ACC_I32_I32>;
+    defm V_DOT2C_I32_I16 : VOP2Inst<"v_dot2c_i32_i16", VOP_DOT_ACC_I32_I32>;
   let SubtargetPredicate = HasDot3Insts in
-    defm V_DOT8C_I32_I4 : VOP2Inst_e32<"v_dot8c_i32_i4", VOP_DOT_ACC_I32_I32>;
+    defm V_DOT8C_I32_I4 : VOP2Inst<"v_dot8c_i32_i4", VOP_DOT_ACC_I32_I32>;
 }

 let AddedComplexity = 30 in {
@@ -719,50 +725,17 @@ defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>;

 // Note: 16-bit instructions produce a 0 result in the high 16-bits
 // on GFX8 and GFX9 and preserve high 16 bits on GFX10+
-def ClearHI16 : OutPatFrag<(ops node:$op),
-                           (V_AND_B32_e64 $op, (V_MOV_B32_e32 (i32 0xffff)))>;
-
-multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst,
-                                bit PreservesHI16 = 0> {
-
-def : GCNPat<
-  (op i16:$src0, i16:$src1),
-  !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1))
->;
-
-def : GCNPat<
-  (i32 (zext (op i16:$src0, i16:$src1))),
-  !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1))
->;
-
-def : GCNPat<
-  (i64 (zext (op i16:$src0, i16:$src1))),
-  (REG_SEQUENCE VReg_64,
-    !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)),
-    sub0,
-    (V_MOV_B32_e32 (i32 0)), sub1)
->;
-}
-
-multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst,
-                                 bit PreservesHI16 = 0> {
-
-def : GCNPat<
-  (op i16:$src0, i16:$src1),
-  !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0))
->;
+multiclass Arithmetic_i16_0Hi_Pats <SDPatternOperator op, Instruction inst> {

 def : GCNPat<
   (i32 (zext (op i16:$src0, i16:$src1))),
-  !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0))
+  (inst $src0, $src1)
 >;

-
 def : GCNPat<
   (i64 (zext (op i16:$src0, i16:$src1))),
    (REG_SEQUENCE VReg_64,
-     !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)),
-     sub0,
+     (inst $src0, $src1), sub0,
      (V_MOV_B32_e32 (i32 0)), sub1)
 >;
 }
@@ -774,53 +747,36 @@ class ZExt_i16_i1_Pat <SDNode ext> : GCNPat <
   $src)
 >;

-let Predicates = [Has16BitInsts] in {
-
-let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
-defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>;
-defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>;
-defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64>;
-defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64>;
-defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64>;
-defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64>;
-defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64>;
-}
-
-let Predicates = [Has16BitInsts, isGFX10Plus] in {
-defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64, 1>;
-defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64, 1>;
-defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64, 1>;
-defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64, 1>;
-defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64, 1>;
-defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64, 1>;
-defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64, 1>;
-}
-
+foreach vt = [i16, v2i16] in {
 def : GCNPat <
-  (and i16:$src0, i16:$src1),
-  (V_AND_B32_e64 $src0, $src1)
+  (and vt:$src0, vt:$src1),
+  (V_AND_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1)
 >;

 def : GCNPat <
-  (or i16:$src0, i16:$src1),
-  (V_OR_B32_e64 $src0, $src1)
+  (or vt:$src0, vt:$src1),
+  (V_OR_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1)
 >;

 def : GCNPat <
-  (xor i16:$src0, i16:$src1),
-  (V_XOR_B32_e64 $src0, $src1)
+  (xor vt:$src0, vt:$src1),
+  (V_XOR_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1)
 >;
-
-let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
-defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64>;
-defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64>;
-defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64>;
 }

-let Predicates = [Has16BitInsts, isGFX10Plus] in {
-defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64, 1>;
-defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64, 1>;
-defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64, 1>;
+let Predicates = [Has16BitInsts] in {
+
+let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
+defm : Arithmetic_i16_0Hi_Pats<add, V_ADD_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<mul, V_MUL_LO_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<sub, V_SUB_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<smin, V_MIN_I16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<smax, V_MAX_I16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<umin, V_MIN_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<umax, V_MAX_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<lshl_rev, V_LSHLREV_B16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<lshr_rev, V_LSHRREV_B16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<ashr_rev, V_ASHRREV_I16_e64>;
 }

 def : ZExt_i16_i1_Pat<zext>;
@@ -847,7 +803,7 @@ def : GCNPat<
 // Target-specific instruction encodings.
 //===----------------------------------------------------------------------===//

-class VOP2_DPP<bits<6> op, VOP2_Pseudo ps,
+class VOP2_DPP<bits<6> op, VOP2_DPP_Pseudo ps,
                string opName = ps.OpName, VOPProfile p = ps.Pfl,
                bit IsDPP16 = 0> :
     VOP_DPP<opName, p, IsDPP16> {
@@ -865,13 +821,18 @@ class VOP2_DPP<bits<6> op, VOP2_Pseudo ps,
   let Inst{31}    = 0x0;
 }

-class VOP2_DPP16<bits<6> op, VOP2_Pseudo ps,
+class Base_VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps,
                  string opName = ps.OpName, VOPProfile p = ps.Pfl> :
     VOP2_DPP<op, ps, opName, p, 1> {
   let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst);
   let SubtargetPredicate = HasDPP16;
 }

+class VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps,
+                 string opName = ps.OpName, VOPProfile p = ps.Pfl> :
+    Base_VOP2_DPP16<op, ps, opName, p>,
+    SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX10>;
+
 class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps,
                 string opName = ps.OpName, VOPProfile p = ps.Pfl> :
     VOP_DPP8<ps.OpName, p> {
@@ -924,6 +885,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
         VOP3e_gfx10<{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
   }
   multiclass VOP2_Real_sdwa_gfx10<bits<6> op> {
+    foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
     def _sdwa_gfx10 :
       VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
       VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
@@ -931,11 +893,13 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
     }
   }
   multiclass VOP2_Real_dpp_gfx10<bits<6> op> {
-    def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
+    foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
       let DecoderNamespace = "SDWA10";
     }
   }
   multiclass VOP2_Real_dpp8_gfx10<bits<6> op> {
+    foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
    def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
      let DecoderNamespace = "DPP8";
    }
@@ -964,6 +928,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
   let DecoderNamespace = "SDWA10" in {
multiclass VOP2_Real_sdwa_gfx10_with_name<bits<6> op, string opName, string asmName> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx10 : VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { @@ -973,13 +938,15 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } multiclass VOP2_Real_dpp_gfx10_with_name<bits<6> op, string opName, string asmName> { - def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32")> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp")> { VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP16; } } multiclass VOP2_Real_dpp8_gfx10_with_name<bits<6> op, string opName, string asmName> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP8; @@ -989,13 +956,15 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } // End DecoderNamespace = "SDWA10" //===------------------------------ VOP2be ------------------------------===// - multiclass VOP2be_Real_gfx10<bits<6> op, string opName, string asmName> { + multiclass VOP2be_Real_e32_gfx10<bits<6> op, string opName, string asmName> { def _e32_gfx10 : VOP2_Real<!cast<VOP2_Pseudo>(opName#"_e32"), SIEncodingFamily.GFX10>, VOP2e<op{5-0}, !cast<VOP2_Pseudo>(opName#"_e32").Pfl> { VOP2_Pseudo Ps = !cast<VOP2_Pseudo>(opName#"_e32"); let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands); } + } + multiclass VOP2be_Real_e64_gfx10<bits<6> op, string opName, string asmName> { def _e64_gfx10 : VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.GFX10>, VOP3be_gfx10<{0, 1, 0, 0, op{5-0}}, @@ -1003,6 +972,9 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { VOP3_Pseudo Ps = !cast<VOP3_Pseudo>(opName#"_e64"); let AsmString = asmName # Ps.AsmOperands; } + } + multiclass VOP2be_Real_sdwa_gfx10<bits<6> op, string opName, string asmName> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx10 : VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { @@ -1010,64 +982,76 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands); let DecoderNamespace = "SDWA10"; } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in + def _sdwa_w32_gfx10 : + Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, + VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa"); + let AsmString = asmName # !subst("vcc", "vcc_lo", Ps.AsmOperands); + let isAsmParserOnly = 1; + let DecoderNamespace = "SDWA10"; + let WaveSizePredicate = isWave32; + } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in + def _sdwa_w64_gfx10 : + Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, + VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa"); + let AsmString = asmName # Ps.AsmOperands; + let isAsmParserOnly = 1; + let DecoderNamespace 
= "SDWA10"; + let WaveSizePredicate = isWave64; + } + } + multiclass VOP2be_Real_dpp_gfx10<bits<6> op, string opName, string asmName> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in def _dpp_gfx10 : - VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { + VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> { string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; let AsmString = asmName # !subst(", vcc", "", AsmDPP); let DecoderNamespace = "SDWA10"; } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_w32_gfx10 : + Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> { + string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP); + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_w64_gfx10 : + Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> { + string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + multiclass VOP2be_Real_dpp8_gfx10<bits<6> op, string opName, string asmName> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; let AsmString = asmName # !subst(", vcc", "", AsmDPP8); let DecoderNamespace = "DPP8"; } - - let WaveSizePredicate = isWave32 in { - def _sdwa_w32_gfx10 : - Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, - VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { - VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa"); - let AsmString = asmName # !subst("vcc", "vcc_lo", Ps.AsmOperands); - let isAsmParserOnly = 1; - let DecoderNamespace = "SDWA10"; - } - def _dpp_w32_gfx10 : - VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { - string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; - let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP); - let isAsmParserOnly = 1; - } - def _dpp8_w32_gfx10 : - VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { - string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; - let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8); - let isAsmParserOnly = 1; - } - } // End WaveSizePredicate = isWave32 - - let WaveSizePredicate = isWave64 in { - def _sdwa_w64_gfx10 : - Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, - VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { - VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa"); - let AsmString = asmName # Ps.AsmOperands; - let isAsmParserOnly = 1; - let DecoderNamespace = "SDWA10"; - } - def _dpp_w64_gfx10 : - VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { - string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; - let AsmString = asmName # AsmDPP; - let isAsmParserOnly = 1; - } - def _dpp8_w64_gfx10 : - VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { - string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; - let AsmString = asmName # AsmDPP8; - let isAsmParserOnly = 1; - } - } // End WaveSizePredicate = isWave64 + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp8_w32_gfx10 : + VOP2_DPP8<op, 
!cast<VOP2_Pseudo>(opName#"_e32"), asmName> { + string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8); + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp8_w64_gfx10 : + VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { + string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } } //===----------------------------- VOP3Only -----------------------------===// @@ -1088,8 +1072,19 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } } // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" -multiclass Base_VOP2_Real_gfx10<bits<6> op> : - VOP2_Real_e32_gfx10<op>, VOP2_Real_e64_gfx10<op>; +multiclass VOP2be_Real_gfx10<bits<6> op, string opName, string asmName> : + VOP2be_Real_e32_gfx10<op, opName, asmName>, + VOP2be_Real_e64_gfx10<op, opName, asmName>, + VOP2be_Real_sdwa_gfx10<op, opName, asmName>, + VOP2be_Real_dpp_gfx10<op, opName, asmName>, + VOP2be_Real_dpp8_gfx10<op, opName, asmName>; + +multiclass VOP2e_Real_gfx10<bits<6> op, string opName, string asmName> : + VOP2_Real_e32_gfx10<op>, + VOP2_Real_e64_gfx10<op>, + VOP2be_Real_sdwa_gfx10<op, opName, asmName>, + VOP2be_Real_dpp_gfx10<op, opName, asmName>, + VOP2be_Real_dpp8_gfx10<op, opName, asmName>; multiclass VOP2_Real_gfx10<bits<6> op> : VOP2_Real_e32_gfx10<op>, VOP2_Real_e64_gfx10<op>, @@ -1103,7 +1098,6 @@ multiclass VOP2_Real_gfx10_with_name<bits<6> op, string opName, VOP2_Real_dpp_gfx10_with_name<op, opName, asmName>, VOP2_Real_dpp8_gfx10_with_name<op, opName, asmName>; -defm V_CNDMASK_B32 : Base_VOP2_Real_gfx10<0x001>; defm V_XNOR_B32 : VOP2_Real_gfx10<0x01e>; defm V_FMAC_F32 : VOP2_Real_gfx10<0x02b>; defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10<0x02c>; @@ -1136,6 +1130,9 @@ defm V_SUB_CO_CI_U32 : defm V_SUBREV_CO_CI_U32 : VOP2be_Real_gfx10<0x02a, "V_SUBBREV_U32", "v_subrev_co_ci_u32">; +defm V_CNDMASK_B32 : + VOP2e_Real_gfx10<0x001, "V_CNDMASK_B32", "v_cndmask_b32">; + // VOP3 only. 
defm V_BFM_B32 : VOP3Only_Real_gfx10<0x363>; defm V_BCNT_U32_B32 : VOP3Only_Real_gfx10<0x364>; @@ -1322,12 +1319,14 @@ multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> : } // End AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" multiclass VOP2_SDWA_Real <bits<6> op> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA>.ret in def _sdwa_vi : VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; } multiclass VOP2_SDWA9_Real <bits<6> op> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx9 : VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; @@ -1350,12 +1349,13 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName let AsmString = AsmName # ps.AsmOperands; let DecoderNamespace = "GFX8"; } - def _sdwa_vi : - VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>, - VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> { - VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa"); - let AsmString = AsmName # ps.AsmOperands; - } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA>.ret in + def _sdwa_vi : + VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>, + VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa"); + let AsmString = AsmName # ps.AsmOperands; + } foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in def _dpp_vi : VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.VI>, @@ -1383,12 +1383,13 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> { let AsmString = AsmName # ps.AsmOperands; let DecoderNamespace = "GFX9"; } - def _sdwa_gfx9 : - VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>, - VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> { - VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa"); - let AsmString = AsmName # ps.AsmOperands; - } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA9>.ret in + def _sdwa_gfx9 : + VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>, + VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa"); + let AsmString = AsmName # ps.AsmOperands; + } foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in def _dpp_gfx9 : VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.GFX9>, @@ -1410,10 +1411,11 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> { VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { let DecoderNamespace = "GFX9"; } - def _sdwa_gfx9 : - VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, - VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> { - } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in + def _sdwa_gfx9 : + VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> { + } foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in def _dpp_gfx9 : VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>, @@ -1554,7 +1556,7 @@ defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>; } // End SubtargetPredicate = HasDLInsts multiclass VOP2_Real_DOT_ACC_gfx9<bits<6> op> : 
VOP2_Real_e32_vi<op> { - def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>; + def _dpp_vi : VOP2_DPP<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>; } multiclass VOP2_Real_DOT_ACC_gfx10<bits<6> op> : diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index 21dbef9240e1..605425972b1c 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -112,7 +112,7 @@ class getVOP3ClampPat<VOPProfile P, SDPatternOperator node> { class getVOP3MAIPat<VOPProfile P, SDPatternOperator node> { list<dag> ret = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, - imm:$cbsz, imm:$abid, imm:$blgp))]; + timm:$cbsz, timm:$abid, timm:$blgp))]; } class VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit VOP3Only = 0> : @@ -385,12 +385,12 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I3 } let SchedRW = [Write64Bit] in { -let SubtargetPredicate = isGFX6GFX7GFX10, Predicates = [isGFX6GFX7GFX10] in { -def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, shl>; -def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, srl>; -def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, sra>; +let SubtargetPredicate = isGFX6GFX7GFX10 in { +def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, shl>; +def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, srl>; +def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, sra>; def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; -} // End SubtargetPredicate = isGFX6GFX7GFX10, Predicates = [isGFX6GFX7GFX10] +} // End SubtargetPredicate = isGFX6GFX7GFX10 let SubtargetPredicate = isGFX8Plus in { def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshl_rev>; @@ -399,21 +399,6 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, as } // End SubtargetPredicate = isGFX8Plus } // End SchedRW = [Write64Bit] -let Predicates = [isGFX8Plus] in { -def : GCNPat < - (getDivergentFrag<shl>.ret i64:$x, i32:$y), - (V_LSHLREV_B64 $y, $x) ->; -def : AMDGPUPat < - (getDivergentFrag<srl>.ret i64:$x, i32:$y), - (V_LSHRREV_B64 $y, $x) ->; -def : AMDGPUPat < - (getDivergentFrag<sra>.ret i64:$x, i32:$y), - (V_ASHRREV_I64 $y, $x) ->; -} - let SchedRW = [Write32Bit] in { let SubtargetPredicate = isGFX8Plus in { @@ -468,13 +453,13 @@ let FPDPRounding = 1 in { def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>; let Uses = [M0, EXEC] in { def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>, - [(set f16:$vdst, (AMDGPUinterp_p2_f16 f32:$src0, (i32 imm:$attrchan), - (i32 imm:$attr), - (i32 imm:$src0_modifiers), + [(set f16:$vdst, (AMDGPUinterp_p2_f16 f32:$src0, (i32 timm:$attrchan), + (i32 timm:$attr), + (i32 timm:$src0_modifiers), (f32 VRegSrc_32:$src2), - (i32 imm:$src2_modifiers), - (i1 imm:$high), - (i1 imm:$clamp)))]>; + (i32 timm:$src2_modifiers), + (i1 timm:$high), + (i1 timm:$clamp)))]>; } // End Uses = [M0, EXEC] } // End FPDPRounding = 1 } // End renamedInGFX9 = 1 @@ -493,21 +478,21 @@ def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f1 let Uses = [M0, EXEC], FPDPRounding = 1 in { def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>, - [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 
imm:$attrchan), - (i32 imm:$attr), - (i32 imm:$src0_modifiers), - (i1 imm:$high), - (i1 imm:$clamp), - (i32 imm:$omod)))]>; + [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 timm:$attrchan), + (i32 timm:$attr), + (i32 timm:$src0_modifiers), + (i1 timm:$high), + (i1 timm:$clamp), + (i32 timm:$omod)))]>; def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>, - [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 imm:$attrchan), - (i32 imm:$attr), - (i32 imm:$src0_modifiers), + [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 timm:$attrchan), + (i32 timm:$attr), + (i32 timm:$src0_modifiers), (f32 VRegSrc_32:$src2), - (i32 imm:$src2_modifiers), - (i1 imm:$high), - (i1 imm:$clamp), - (i32 imm:$omod)))]>; + (i32 timm:$src2_modifiers), + (i1 timm:$high), + (i1 timm:$clamp), + (i32 timm:$omod)))]>; } // End Uses = [M0, EXEC], FPDPRounding = 1 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1 @@ -657,11 +642,11 @@ let SubtargetPredicate = isGFX10Plus in { } // End $vdst = $vdst_in, DisableEncoding $vdst_in def : GCNPat< - (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc), + (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc), (V_PERMLANE16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in) >; def : GCNPat< - (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc), + (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc), (V_PERMLANEX16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in) >; } // End SubtargetPredicate = isGFX10Plus diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td index 55ee5f6577cf..0c13f39fec02 100644 --- a/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/lib/Target/AMDGPU/VOP3PInstructions.td @@ -261,6 +261,7 @@ class SDot2Pat<Instruction Inst> : GCNPat < let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate; } +let IsDOT = 1 in { let SubtargetPredicate = HasDot2Insts in { def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>; @@ -277,6 +278,7 @@ def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32 def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>; } // End SubtargetPredicate = HasDot1Insts +} // End let IsDOT = 1 multiclass DotPats<SDPatternOperator dot_op, VOP3PInst dot_inst> { diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td index b3513e383d10..8ef0ec7b71f4 100644 --- a/lib/Target/AMDGPU/VOPCInstructions.td +++ b/lib/Target/AMDGPU/VOPCInstructions.td @@ -183,7 +183,7 @@ multiclass VOPCXInstAliases <string OpName, string Arch> { } -class getVOPCPat64 <PatLeaf cond, VOPProfile P> : LetDummies { +class getVOPCPat64 <SDPatternOperator cond, VOPProfile P> : LetDummies { list<dag> ret = !if(P.HasModifiers, [(set i1:$sdst, (setcc (P.Src0VT @@ -202,7 +202,7 @@ class VCMPXNoSDstTable <bit has_sdst, string Name> { multiclass VOPC_Pseudos <string opName, VOPC_Profile P, - PatLeaf cond = COND_NULL, + SDPatternOperator cond = COND_NULL, string revOp = opName, bit DefExec = 0> { @@ -225,6 +225,7 @@ multiclass VOPC_Pseudos <string opName, let isCommutable = 1; } + foreach _ = BoolToList<P.HasExtSDWA>.ret in def _sdwa : VOPC_SDWA_Pseudo <opName, P> { let Defs = !if(DefExec, [VCC, EXEC], [VCC]); let SchedRW = P.Schedule; @@ -236,7 +237,7 @@ 
multiclass VOPC_Pseudos <string opName, let SubtargetPredicate = HasSdstCMPX in { multiclass VOPCX_Pseudos <string opName, VOPC_Profile P, VOPC_Profile P_NoSDst, - PatLeaf cond = COND_NULL, + SDPatternOperator cond = COND_NULL, string revOp = opName> : VOPC_Pseudos <opName, P, cond, revOp, 1> { @@ -261,6 +262,7 @@ multiclass VOPCX_Pseudos <string opName, let SubtargetPredicate = HasNoSdstCMPX; } + foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in def _nosdst_sdwa : VOPC_SDWA_Pseudo <opName#"_nosdst", P_NoSDst> { let Defs = [EXEC]; let SchedRW = P_NoSDst.Schedule; @@ -285,22 +287,23 @@ def VOPC_I16_I16 : VOPC_NoSdst_Profile<[Write32Bit], i16>; def VOPC_I32_I32 : VOPC_NoSdst_Profile<[Write32Bit], i32>; def VOPC_I64_I64 : VOPC_NoSdst_Profile<[Write64Bit], i64>; -multiclass VOPC_F16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> : +multiclass VOPC_F16 <string opName, SDPatternOperator cond = COND_NULL, + string revOp = opName> : VOPC_Pseudos <opName, VOPC_I1_F16_F16, cond, revOp, 0>; -multiclass VOPC_F32 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> : +multiclass VOPC_F32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : VOPC_Pseudos <opName, VOPC_I1_F32_F32, cond, revOp, 0>; -multiclass VOPC_F64 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> : +multiclass VOPC_F64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : VOPC_Pseudos <opName, VOPC_I1_F64_F64, cond, revOp, 0>; -multiclass VOPC_I16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> : +multiclass VOPC_I16 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : VOPC_Pseudos <opName, VOPC_I1_I16_I16, cond, revOp, 0>; -multiclass VOPC_I32 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> : +multiclass VOPC_I32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : VOPC_Pseudos <opName, VOPC_I1_I32_I32, cond, revOp, 0>; -multiclass VOPC_I64 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> : +multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>; multiclass VOPCX_F16 <string opName, string revOp = opName> : @@ -669,6 +672,7 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec, let SchedRW = p.Schedule; } + foreach _ = BoolToList<p.HasExtSDWA>.ret in def _sdwa : VOPC_SDWA_Pseudo <opName, p> { let Defs = !if(DefExec, !if(DefVcc, [VCC, EXEC], [EXEC]), !if(DefVcc, [VCC], [])); @@ -698,6 +702,7 @@ multiclass VOPCX_Class_Pseudos <string opName, let SubtargetPredicate = HasNoSdstCMPX; } + foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in def _nosdst_sdwa : VOPC_SDWA_Pseudo <opName#"_nosdst", P_NoSDst> { let Defs = [EXEC]; let SchedRW = P_NoSDst.Schedule; @@ -737,8 +742,11 @@ defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <"v_cmp_class_f32">; defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">; defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <"v_cmp_class_f64">; defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <"v_cmpx_class_f64">; + +let SubtargetPredicate = Has16BitInsts in { defm V_CMP_CLASS_F16 : VOPC_CLASS_F16 <"v_cmp_class_f16">; defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">; +} //===----------------------------------------------------------------------===// // V_ICMPIntrinsic Pattern. 
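A side note on the imm to timm switches in the VOP1/VOP3 pattern hunks above (an illustration added here, not part of the upstream diff): intrinsic operands that are required to stay immediates are now emitted as TargetConstant nodes, and only `timm` patterns match those, while `imm` matches ordinary ISD::Constant nodes. A minimal C++ sketch, assuming the usual SelectionDAG API:

// Sketch only: immediate intrinsic operands are built as TargetConstant,
// so TableGen patterns must use timm rather than imm to match them.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue buildImmArg(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val) {
  // A TargetConstant is opaque to generic DAG combines and survives to
  // instruction selection as a plain immediate operand.
  return DAG.getTargetConstant(Val, DL, MVT::i32);
}

The helper name buildImmArg is hypothetical; the point is only that getTargetConstant, not getConstant, produces the node shape the rewritten patterns expect.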
@@ -878,6 +886,7 @@ let AssemblerPredicate = isGFX10Plus in { } } // End DecoderNamespace = "GFX10" + foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx10 : VOP_SDWA10_Real<!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>, VOPC_SDWA9e<op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; @@ -903,6 +912,7 @@ let AssemblerPredicate = isGFX10Plus in { } } // End DecoderNamespace = "GFX10" + foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_nosdst_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx10 : VOP_SDWA10_Real<!cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa")>, VOPC_SDWA9e<op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa").Pfl> { @@ -1223,10 +1233,12 @@ multiclass VOPC_Real_vi <bits<10> op> { } } + foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA>.ret in def _sdwa_vi : VOP_SDWA_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>, VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; + foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx9 : VOP_SDWA9_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>, VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td index 677095a354be..f208a1134a5a 100644 --- a/lib/Target/AMDGPU/VOPInstructions.td +++ b/lib/Target/AMDGPU/VOPInstructions.td @@ -14,6 +14,7 @@ class LetDummies { bit isReMaterializable; bit isAsCheapAsAMove; bit VOPAsmPrefer32Bit; + bit FPDPRounding; Predicate SubtargetPredicate; string Constraints; string DisableEncoding; @@ -41,9 +42,7 @@ class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins, string asm, list<dag> pattern> : InstSI <outs, ins, asm, pattern>, VOP <opName>, - SIMCInstr <opName#suffix, SIEncodingFamily.NONE>, - MnemonicAlias<opName#suffix, opName> { - + SIMCInstr <opName#suffix, SIEncodingFamily.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; let UseNamedOperandTable = 1; @@ -148,6 +147,7 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> : // copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; + let OtherPredicates = ps.OtherPredicates; let AsmMatchConverter = ps.AsmMatchConverter; let AsmVariantName = ps.AsmVariantName; let Constraints = ps.Constraints; @@ -473,8 +473,7 @@ class VOP_SDWA9Be<VOPProfile P> : VOP_SDWA9e<P> { class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : InstSI <P.OutsSDWA, P.InsSDWA, "", pattern>, VOP <opName>, - SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE>, - MnemonicAlias <opName#"_sdwa", opName> { + SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; @@ -595,8 +594,7 @@ class VOP_DPPe<VOPProfile P, bit IsDPP16=0> : Enc64 { class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, pattern>, VOP <OpName>, - SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE>, - MnemonicAlias <OpName#"_dpp", OpName> { + SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; diff --git a/lib/Target/ARC/ARCFrameLowering.h b/lib/Target/ARC/ARCFrameLowering.h index 41b559d16761..9242400fb28d 100644 --- a/lib/Target/ARC/ARCFrameLowering.h +++ b/lib/Target/ARC/ARCFrameLowering.h @@ -27,8 +27,8 @@ class ARCInstrInfo; class ARCFrameLowering : public TargetFrameLowering { public: ARCFrameLowering(const ARCSubtarget &st) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 4, 0), ST(st) { - } + : 
TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(4), 0), + ST(st) {} /// Insert Prologue into the function. void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; diff --git a/lib/Target/ARC/ARCISelLowering.cpp b/lib/Target/ARC/ARCISelLowering.cpp index 847d23f0abdb..751fd567bae8 100644 --- a/lib/Target/ARC/ARCISelLowering.cpp +++ b/lib/Target/ARC/ARCISelLowering.cpp @@ -716,7 +716,7 @@ SDValue ARCTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); assert(cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0 && "Only support lowering frame addr of current frame."); - unsigned FrameReg = ARI.getFrameRegister(MF); + Register FrameReg = ARI.getFrameRegister(MF); return DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); } diff --git a/lib/Target/ARC/ARCMachineFunctionInfo.h b/lib/Target/ARC/ARCMachineFunctionInfo.h index 31aa5b93246c..d4dcf9bf285c 100644 --- a/lib/Target/ARC/ARCMachineFunctionInfo.h +++ b/lib/Target/ARC/ARCMachineFunctionInfo.h @@ -34,8 +34,8 @@ public: explicit ARCFunctionInfo(MachineFunction &MF) : ReturnStackOffsetSet(false), VarArgsFrameIndex(0), ReturnStackOffset(-1U), MaxCallStackReq(0) { - // Functions are 4-byte (2**2) aligned. - MF.setAlignment(2); + // Functions are 4-byte aligned. + MF.setAlignment(Align(4)); } ~ARCFunctionInfo() {} diff --git a/lib/Target/ARC/ARCOptAddrMode.cpp b/lib/Target/ARC/ARCOptAddrMode.cpp index c922b99c57b0..22a3b9111c8e 100644 --- a/lib/Target/ARC/ARCOptAddrMode.cpp +++ b/lib/Target/ARC/ARCOptAddrMode.cpp @@ -139,8 +139,7 @@ static bool dominatesAllUsesOf(const MachineInstr *MI, unsigned VReg, MachineDominatorTree *MDT, MachineRegisterInfo *MRI) { - assert(TargetRegisterInfo::isVirtualRegister(VReg) && - "Expected virtual register!"); + assert(Register::isVirtualRegister(VReg) && "Expected virtual register!"); for (auto it = MRI->use_nodbg_begin(VReg), end = MRI->use_nodbg_end(); it != end; ++it) { @@ -181,7 +180,7 @@ static bool isLoadStoreThatCanHandleDisplacement(const TargetInstrInfo *TII, bool ARCOptAddrMode::noUseOfAddBeforeLoadOrStore(const MachineInstr *Add, const MachineInstr *Ldst) { - unsigned R = Add->getOperand(0).getReg(); + Register R = Add->getOperand(0).getReg(); return dominatesAllUsesOf(Ldst, R, MDT, MRI); } @@ -205,9 +204,8 @@ MachineInstr *ARCOptAddrMode::tryToCombine(MachineInstr &Ldst) { return nullptr; } - unsigned B = Base.getReg(); - if (TargetRegisterInfo::isStackSlot(B) || - !TargetRegisterInfo::isVirtualRegister(B)) { + Register B = Base.getReg(); + if (Register::isStackSlot(B) || !Register::isVirtualRegister(B)) { LLVM_DEBUG(dbgs() << "[ABAW] Base is not VReg\n"); return nullptr; } @@ -285,7 +283,7 @@ ARCOptAddrMode::canJoinInstructions(MachineInstr *Ldst, MachineInstr *Add, return nullptr; } - unsigned BaseReg = Ldst->getOperand(BasePos).getReg(); + Register BaseReg = Ldst->getOperand(BasePos).getReg(); // prohibit this: // v1 = add v0, c @@ -294,7 +292,7 @@ ARCOptAddrMode::canJoinInstructions(MachineInstr *Ldst, MachineInstr *Add, // st v0, [v0, 0] // v1 = add v0, c if (Ldst->mayStore() && Ldst->getOperand(0).isReg()) { - unsigned StReg = Ldst->getOperand(0).getReg(); + Register StReg = Ldst->getOperand(0).getReg(); if (Add->getOperand(0).getReg() == StReg || BaseReg == StReg) { LLVM_DEBUG(dbgs() << "[canJoinInstructions] Store uses result of Add\n"); return nullptr; @@ -447,7 +445,7 @@ void ARCOptAddrMode::changeToAddrMode(MachineInstr &Ldst, unsigned NewOpcode, MachineOperand Src = MachineOperand::CreateImm(0xDEADBEEF); 
AII->getBaseAndOffsetPosition(Ldst, BasePos, OffPos); - unsigned BaseReg = Ldst.getOperand(BasePos).getReg(); + Register BaseReg = Ldst.getOperand(BasePos).getReg(); Ldst.RemoveOperand(OffPos); Ldst.RemoveOperand(BasePos); diff --git a/lib/Target/ARC/ARCRegisterInfo.cpp b/lib/Target/ARC/ARCRegisterInfo.cpp index 9c8340ac8f81..a7f89b385ffe 100644 --- a/lib/Target/ARC/ARCRegisterInfo.cpp +++ b/lib/Target/ARC/ARCRegisterInfo.cpp @@ -206,7 +206,7 @@ void ARCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, LLVM_DEBUG(dbgs() << "Offset : " << Offset << "\n" << "<--------->\n"); - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); assert(ARC::GPR32RegClass.contains(Reg) && "Unexpected register operand"); if (!TFI->hasFP(MF)) { diff --git a/lib/Target/ARC/ARCTargetMachine.cpp b/lib/Target/ARC/ARCTargetMachine.cpp index 9fb45d686c26..34700dc22c54 100644 --- a/lib/Target/ARC/ARCTargetMachine.cpp +++ b/lib/Target/ARC/ARCTargetMachine.cpp @@ -38,7 +38,7 @@ ARCTargetMachine::ARCTargetMachine(const Target &T, const Triple &TT, "f32:32:32-i64:32-f64:32-a:0:32-n32", TT, CPU, FS, Options, getRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), - TLOF(make_unique<TargetLoweringObjectFileELF>()), + TLOF(std::make_unique<TargetLoweringObjectFileELF>()), Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp index fb238bfc9cbc..30b9c8071ba2 100644 --- a/lib/Target/ARM/A15SDOptimizer.cpp +++ b/lib/Target/ARM/A15SDOptimizer.cpp @@ -133,9 +133,9 @@ bool A15SDOptimizer::usesRegClass(MachineOperand &MO, const TargetRegisterClass *TRC) { if (!MO.isReg()) return false; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) return MRI->getRegClass(Reg)->hasSuperClassEq(TRC); else return TRC->contains(Reg); @@ -151,7 +151,7 @@ unsigned A15SDOptimizer::getDPRLaneFromSPR(unsigned SReg) { // Get the subreg type that is most likely to be coalesced // for an SPR register that will be used in VDUP32d pseudo. 
unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) { - if (!TRI->isVirtualRegister(SReg)) + if (!Register::isVirtualRegister(SReg)) return getDPRLaneFromSPR(SReg); MachineInstr *MI = MRI->getVRegDef(SReg); @@ -166,7 +166,7 @@ unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) { SReg = MI->getOperand(1).getReg(); } - if (TargetRegisterInfo::isVirtualRegister(SReg)) { + if (Register::isVirtualRegister(SReg)) { if (MO->getSubReg() == ARM::ssub_1) return ARM::ssub_1; return ARM::ssub_0; } @@ -191,8 +191,8 @@ void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) { for (MachineOperand &MO : MI->operands()) { if ((!MO.isReg()) || (!MO.isUse())) continue; - unsigned Reg = MO.getReg(); - if (!TRI->isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) continue; MachineOperand *Op = MI->findRegisterDefOperand(Reg); @@ -213,8 +213,8 @@ void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) { for (MachineOperand &MODef : Def->operands()) { if ((!MODef.isReg()) || (!MODef.isDef())) continue; - unsigned DefReg = MODef.getReg(); - if (!TRI->isVirtualRegister(DefReg)) { + Register DefReg = MODef.getReg(); + if (!Register::isVirtualRegister(DefReg)) { IsDead = false; break; } @@ -245,10 +245,10 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) { } if (MI->isInsertSubreg()) { - unsigned DPRReg = MI->getOperand(1).getReg(); - unsigned SPRReg = MI->getOperand(2).getReg(); + Register DPRReg = MI->getOperand(1).getReg(); + Register SPRReg = MI->getOperand(2).getReg(); - if (TRI->isVirtualRegister(DPRReg) && TRI->isVirtualRegister(SPRReg)) { + if (Register::isVirtualRegister(DPRReg) && Register::isVirtualRegister(SPRReg)) { MachineInstr *DPRMI = MRI->getVRegDef(MI->getOperand(1).getReg()); MachineInstr *SPRMI = MRI->getVRegDef(MI->getOperand(2).getReg()); @@ -267,7 +267,7 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) { // Find the thing we're subreg copying out of - is it of the same // regclass as DPRMI? (i.e. a DPR or QPR). 
- unsigned FullReg = SPRMI->getOperand(1).getReg(); + Register FullReg = SPRMI->getOperand(1).getReg(); const TargetRegisterClass *TRC = MRI->getRegClass(MI->getOperand(1).getReg()); if (TRC->hasSuperClassEq(MRI->getRegClass(FullReg))) { @@ -296,9 +296,9 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) { if (!MI->getOperand(I).isReg()) continue; ++NumTotal; - unsigned OpReg = MI->getOperand(I).getReg(); + Register OpReg = MI->getOperand(I).getReg(); - if (!TRI->isVirtualRegister(OpReg)) + if (!Register::isVirtualRegister(OpReg)) break; MachineInstr *Def = MRI->getVRegDef(OpReg); @@ -342,7 +342,7 @@ bool A15SDOptimizer::hasPartialWrite(MachineInstr *MI) { MachineInstr *A15SDOptimizer::elideCopies(MachineInstr *MI) { if (!MI->isFullCopy()) return MI; - if (!TRI->isVirtualRegister(MI->getOperand(1).getReg())) + if (!Register::isVirtualRegister(MI->getOperand(1).getReg())) return nullptr; MachineInstr *Def = MRI->getVRegDef(MI->getOperand(1).getReg()); if (!Def) @@ -369,8 +369,8 @@ void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI, Reached.insert(MI); if (MI->isPHI()) { for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { - unsigned Reg = MI->getOperand(I).getReg(); - if (!TRI->isVirtualRegister(Reg)) { + Register Reg = MI->getOperand(I).getReg(); + if (!Register::isVirtualRegister(Reg)) { continue; } MachineInstr *NewMI = MRI->getVRegDef(Reg); @@ -379,7 +379,7 @@ void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI, Front.push_back(NewMI); } } else if (MI->isFullCopy()) { - if (!TRI->isVirtualRegister(MI->getOperand(1).getReg())) + if (!Register::isVirtualRegister(MI->getOperand(1).getReg())) continue; MachineInstr *NewMI = MRI->getVRegDef(MI->getOperand(1).getReg()); if (!NewMI) @@ -418,8 +418,8 @@ unsigned A15SDOptimizer::createDupLane(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, unsigned Reg, unsigned Lane, bool QPR) { - unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass : - &ARM::DPRRegClass); + Register Out = + MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass : &ARM::DPRRegClass); BuildMI(MBB, InsertBefore, DL, TII->get(QPR ? 
ARM::VDUPLN32q : ARM::VDUPLN32d), Out) .addReg(Reg) @@ -434,7 +434,7 @@ unsigned A15SDOptimizer::createExtractSubreg( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, unsigned DReg, unsigned Lane, const TargetRegisterClass *TRC) { - unsigned Out = MRI->createVirtualRegister(TRC); + Register Out = MRI->createVirtualRegister(TRC); BuildMI(MBB, InsertBefore, DL, @@ -448,7 +448,7 @@ unsigned A15SDOptimizer::createExtractSubreg( unsigned A15SDOptimizer::createRegSequence( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, unsigned Reg1, unsigned Reg2) { - unsigned Out = MRI->createVirtualRegister(&ARM::QPRRegClass); + Register Out = MRI->createVirtualRegister(&ARM::QPRRegClass); BuildMI(MBB, InsertBefore, DL, @@ -466,7 +466,7 @@ unsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, unsigned Ssub0, unsigned Ssub1) { - unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass); + Register Out = MRI->createVirtualRegister(&ARM::DPRRegClass); BuildMI(MBB, InsertBefore, DL, TII->get(ARM::VEXTd32), Out) .addReg(Ssub0) .addReg(Ssub1) @@ -478,7 +478,7 @@ unsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB, unsigned A15SDOptimizer::createInsertSubreg( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, unsigned DReg, unsigned Lane, unsigned ToInsert) { - unsigned Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass); + Register Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass); BuildMI(MBB, InsertBefore, DL, @@ -494,7 +494,7 @@ unsigned A15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL) { - unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass); + Register Out = MRI->createVirtualRegister(&ARM::DPRRegClass); BuildMI(MBB, InsertBefore, DL, @@ -602,7 +602,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { // we can end up with multiple defs of this DPR. SmallVector<MachineInstr *, 8> DefSrcs; - if (!TRI->isVirtualRegister(*I)) + if (!Register::isVirtualRegister(*I)) continue; MachineInstr *Def = MRI->getVRegDef(*I); if (!Def) @@ -622,7 +622,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { // Collect all the uses of this MI's DPR def for updating later. 
SmallVector<MachineOperand*, 8> Uses; - unsigned DPRDefReg = MI->getOperand(0).getReg(); + Register DPRDefReg = MI->getOperand(0).getReg(); for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DPRDefReg), E = MRI->use_end(); I != E; ++I) Uses.push_back(&*I); diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h index bf8ed6562fe7..2e6f756d522c 100644 --- a/lib/Target/ARM/ARM.h +++ b/lib/Target/ARM/ARM.h @@ -35,6 +35,7 @@ class MachineInstr; class MCInst; class PassRegistry; +Pass *createMVETailPredicationPass(); FunctionPass *createARMLowOverheadLoopsPass(); Pass *createARMParallelDSPPass(); FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, @@ -67,6 +68,7 @@ void initializeThumb2SizeReducePass(PassRegistry &); void initializeThumb2ITBlockPass(PassRegistry &); void initializeMVEVPTBlockPass(PassRegistry &); void initializeARMLowOverheadLoopsPass(PassRegistry &); +void initializeMVETailPredicationPass(PassRegistry &); } // end namespace llvm diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index b687db12eaf5..fed4cb2b9316 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -57,12 +57,15 @@ def FeatureD32 : SubtargetFeature<"d32", "HasD32", "true", "Extend FP to 32 double registers">; multiclass VFPver<string name, string query, string description, - list<SubtargetFeature> prev = [], - list<SubtargetFeature> otherimplies = []> { + list<SubtargetFeature> prev, + list<SubtargetFeature> otherimplies, + list<SubtargetFeature> vfp2prev = []> { def _D16_SP: SubtargetFeature< name#"d16sp", query#"D16SP", "true", description#" with only 16 d-registers and no double precision", - !foreach(v, prev, !cast<SubtargetFeature>(v # "_D16_SP")) # otherimplies>; + !foreach(v, prev, !cast<SubtargetFeature>(v # "_D16_SP")) # + !foreach(v, vfp2prev, !cast<SubtargetFeature>(v # "_SP")) # + otherimplies>; def _SP: SubtargetFeature< name#"sp", query#"SP", "true", description#" with no double precision", @@ -72,6 +75,7 @@ multiclass VFPver<string name, string query, string description, name#"d16", query#"D16", "true", description#" with only 16 d-registers", !foreach(v, prev, !cast<SubtargetFeature>(v # "_D16")) # + vfp2prev # otherimplies # [FeatureFP64, !cast<SubtargetFeature>(NAME # "_D16_SP")]>; def "": SubtargetFeature< name, query, "true", description, @@ -80,11 +84,17 @@ multiclass VFPver<string name, string query, string description, !cast<SubtargetFeature>(NAME # "_SP")]>; } -defm FeatureVFP2: VFPver<"vfp2", "HasVFPv2", "Enable VFP2 instructions", - [], [FeatureFPRegs]>; +def FeatureVFP2_SP : SubtargetFeature<"vfp2sp", "HasVFPv2SP", "true", + "Enable VFP2 instructions with " + "no double precision", + [FeatureFPRegs]>; + +def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true", + "Enable VFP2 instructions", + [FeatureFP64, FeatureVFP2_SP]>; defm FeatureVFP3: VFPver<"vfp3", "HasVFPv3", "Enable VFP3 instructions", - [FeatureVFP2]>; + [], [], [FeatureVFP2]>; def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", "Enable NEON instructions", @@ -98,7 +108,7 @@ defm FeatureVFP4: VFPver<"vfp4", "HasVFPv4", "Enable VFP4 instructions", [FeatureVFP3], [FeatureFP16]>; defm FeatureFPARMv8: VFPver<"fp-armv8", "HasFPARMv8", "Enable ARMv8 FP", - [FeatureVFP4]>; + [FeatureVFP4], []>; def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", "Enable full half-precision " @@ -302,9 +312,18 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding", def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", "Prefer 32-bit Thumb 
instrs">; -def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopAlignment","2", +def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopLogAlignment","2", "Prefer 32-bit alignment for loops">; +def FeatureMVEVectorCostFactor1 : SubtargetFeature<"mve1beat", "MVEVectorCostFactor", "1", + "Model MVE instructions as a 1 beat per tick architecture">; + +def FeatureMVEVectorCostFactor2 : SubtargetFeature<"mve2beat", "MVEVectorCostFactor", "2", + "Model MVE instructions as a 2 beats per tick architecture">; + +def FeatureMVEVectorCostFactor4 : SubtargetFeature<"mve4beat", "MVEVectorCostFactor", "4", + "Model MVE instructions as a 4 beats per tick architecture">; + /// Some instructions update CPSR partially, which can add false dependency for /// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is /// mapped to a separate physical register. Avoid partial CPSR update for these @@ -1156,6 +1175,13 @@ def : ProcNoItin<"cortex-a76ae", [ARMv82a, ProcA76, FeatureFullFP16, FeatureDotProd]>; +def : ProcNoItin<"neoverse-n1", [ARMv82a, + FeatureHWDivThumb, + FeatureHWDivARM, + FeatureCrypto, + FeatureCRC, + FeatureDotProd]>; + def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, FeatureHasRetAddrStack, FeatureNEONForFP, diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index e29077266fcd..c8c91e53c44e 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -168,7 +168,7 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { // relatively easy to exceed the thumb branch range within a TU. if (! ThumbIndirectPads.empty()) { OutStreamer->EmitAssemblerFlag(MCAF_Code16); - EmitAlignment(1); + EmitAlignment(Align(2)); for (std::pair<unsigned, MCSymbol *> &TIP : ThumbIndirectPads) { OutStreamer->EmitLabel(TIP.second); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBX) @@ -203,8 +203,8 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, switch (MO.getType()) { default: llvm_unreachable("<unknown operand type>"); case MachineOperand::MO_Register: { - unsigned Reg = MO.getReg(); - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + Register Reg = MO.getReg(); + assert(Register::isPhysicalRegister(Reg)); assert(!MO.getSubReg() && "Subregs should be eliminated!"); if(ARM::GPRPairRegClass.contains(Reg)) { const MachineFunction &MF = *MI->getParent()->getParent(); @@ -275,7 +275,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, return false; case 'y': // Print a VFP single precision register as indexed double. if (MI->getOperand(OpNum).isReg()) { - unsigned Reg = MI->getOperand(OpNum).getReg(); + Register Reg = MI->getOperand(OpNum).getReg(); const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); // Find the 'd' register that has this 's' register as a sub-register, // and determine the lane number. @@ -302,14 +302,14 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, if (!MI->getOperand(OpNum).isReg()) return true; const MachineOperand &MO = MI->getOperand(OpNum); - unsigned RegBegin = MO.getReg(); + Register RegBegin = MO.getReg(); // This takes advantage of the 2 operand-ness of ldm/stm and that we've // already got the operands in registers that are operands to the // inline asm statement. 
O << "{"; if (ARM::GPRPairRegClass.contains(RegBegin)) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - unsigned Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0); + Register Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0); O << ARMInstPrinter::getRegisterName(Reg0) << ", "; RegBegin = TRI->getSubReg(RegBegin, ARM::gsub_1); } @@ -378,8 +378,8 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, if (!MO.isReg()) return true; const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - unsigned Reg = TRI->getSubReg(MO.getReg(), FirstHalf ? - ARM::gsub_0 : ARM::gsub_1); + Register Reg = + TRI->getSubReg(MO.getReg(), FirstHalf ? ARM::gsub_0 : ARM::gsub_1); O << ARMInstPrinter::getRegisterName(Reg); return false; } @@ -391,7 +391,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, const MachineOperand &MO = MI->getOperand(RegOp); if (!MO.isReg()) return true; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); O << ARMInstPrinter::getRegisterName(Reg); return false; } @@ -400,12 +400,12 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, case 'f': { // The high doubleword register of a NEON quad register. if (!MI->getOperand(OpNum).isReg()) return true; - unsigned Reg = MI->getOperand(OpNum).getReg(); + Register Reg = MI->getOperand(OpNum).getReg(); if (!ARM::QPRRegClass.contains(Reg)) return true; const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - unsigned SubReg = TRI->getSubReg(Reg, ExtraCode[0] == 'e' ? - ARM::dsub_0 : ARM::dsub_1); + Register SubReg = + TRI->getSubReg(Reg, ExtraCode[0] == 'e' ? ARM::dsub_0 : ARM::dsub_1); O << ARMInstPrinter::getRegisterName(SubReg); return false; } @@ -419,7 +419,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, return true; const MachineFunction &MF = *MI->getParent()->getParent(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if(!ARM::GPRPairRegClass.contains(Reg)) return false; Reg = TRI->getSubReg(Reg, ARM::gsub_1); @@ -526,7 +526,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { if (!Stubs.empty()) { // Switch with ".non_lazy_symbol_pointer" directive. OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); - EmitAlignment(2); + EmitAlignment(Align(4)); for (auto &Stub : Stubs) emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second); @@ -539,7 +539,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { if (!Stubs.empty()) { // Switch with ".non_lazy_symbol_pointer" directive. OutStreamer->SwitchSection(TLOFMacho.getThreadLocalPointerSection()); - EmitAlignment(2); + EmitAlignment(Align(4)); for (auto &Stub : Stubs) emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second); @@ -940,7 +940,7 @@ void ARMAsmPrinter::EmitJumpTableAddrs(const MachineInstr *MI) { // Make sure the Thumb jump table is 4-byte aligned. This will be a nop for // ARM mode tables. - EmitAlignment(2); + EmitAlignment(Align(4)); // Emit a label for the jump table. MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI); @@ -986,7 +986,7 @@ void ARMAsmPrinter::EmitJumpTableInsts(const MachineInstr *MI) { // Make sure the Thumb jump table is 4-byte aligned. This will be a nop for // ARM mode tables. - EmitAlignment(2); + EmitAlignment(Align(4)); // Emit a label for the jump table. 
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI); @@ -1015,7 +1015,7 @@ void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI, unsigned JTI = MO1.getIndex(); if (Subtarget->isThumb1Only()) - EmitAlignment(2); + EmitAlignment(Align(4)); MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI); OutStreamer->EmitLabel(JTISymbol); @@ -1058,7 +1058,7 @@ void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI, OutStreamer->EmitDataRegion(MCDR_DataRegionEnd); // Make sure the next instruction is 2-byte aligned. - EmitAlignment(1); + EmitAlignment(Align(2)); } void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { @@ -1072,7 +1072,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { MF.getSubtarget().getRegisterInfo(); const MachineRegisterInfo &MachineRegInfo = MF.getRegInfo(); - unsigned FramePtr = TargetRegInfo->getFrameRegister(MF); + Register FramePtr = TargetRegInfo->getFrameRegister(MF); unsigned Opc = MI->getOpcode(); unsigned SrcReg, DstReg; @@ -1136,7 +1136,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { } // Check for registers that are remapped (for a Thumb1 prologue that // saves high registers). - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (unsigned RemappedReg = AFI->EHPrologueRemappedRegs.lookup(Reg)) Reg = RemappedReg; RegList.push_back(Reg); @@ -1326,7 +1326,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // So here we generate a bl to a small jump pad that does bx rN. // The jump pads are emitted after the function body. - unsigned TReg = MI->getOperand(0).getReg(); + Register TReg = MI->getOperand(0).getReg(); MCSymbol *TRegSym = nullptr; for (std::pair<unsigned, MCSymbol *> &TIP : ThumbIndirectPads) { if (TIP.first == TReg) { @@ -1663,8 +1663,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { case ARM::tTBH_JT: { bool Is8Bit = MI->getOpcode() == ARM::tTBB_JT; - unsigned Base = MI->getOperand(0).getReg(); - unsigned Idx = MI->getOperand(1).getReg(); + Register Base = MI->getOperand(0).getReg(); + Register Idx = MI->getOperand(1).getReg(); assert(MI->getOperand(1).isKill() && "We need the index register as scratch!"); // Multiply up idx if necessary. 
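A note on the EmitAlignment and setAlignment changes running through the ARC and ARM hunks above (illustrative only): the new llvm::Align type stores the alignment in bytes, whereas these interfaces previously took log2 values, which is why EmitAlignment(2) becomes EmitAlignment(Align(4)). A minimal sketch using the helpers from llvm/Support/Alignment.h:

// Sketch only: Align stores the byte alignment; the old unsigned
// arguments were log2(bytes), so Align(4) replaces a literal 2.
#include "llvm/Support/Alignment.h"
#include <cassert>
using llvm::Align;

static void checkAlignRoundTrip() {
  Align A(4);             // 4-byte alignment; must be a power of two
  assert(Log2(A) == 2);   // the value the old log2-based API expected
  assert(A.value() == 4); // bytes, as the new API stores it
}

Carrying the unit in the type removes a long-standing source of off-by-log2 bugs at these call sites.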
@@ -1844,8 +1844,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // b LSJLJEH // movs r0, #1 // LSJLJEH: - unsigned SrcReg = MI->getOperand(0).getReg(); - unsigned ValReg = MI->getOperand(1).getReg(); + Register SrcReg = MI->getOperand(0).getReg(); + Register ValReg = MI->getOperand(1).getReg(); MCSymbol *Label = OutContext.createTempSymbol("SJLJEH", false, true); OutStreamer->AddComment("eh_setjmp begin"); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr) @@ -1910,8 +1910,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // mov r0, #0 // add pc, pc, #0 // mov r0, #1 - unsigned SrcReg = MI->getOperand(0).getReg(); - unsigned ValReg = MI->getOperand(1).getReg(); + Register SrcReg = MI->getOperand(0).getReg(); + Register ValReg = MI->getOperand(1).getReg(); OutStreamer->AddComment("eh_setjmp begin"); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDri) @@ -1967,8 +1967,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // ldr $scratch, [$src, #4] // ldr r7, [$src] // bx $scratch - unsigned SrcReg = MI->getOperand(0).getReg(); - unsigned ScratchReg = MI->getOperand(1).getReg(); + Register SrcReg = MI->getOperand(0).getReg(); + Register ScratchReg = MI->getOperand(1).getReg(); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDRi12) .addReg(ARM::SP) .addReg(SrcReg) @@ -2027,8 +2027,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // ldr $scratch, [$src, #4] // ldr r7, [$src] // bx $scratch - unsigned SrcReg = MI->getOperand(0).getReg(); - unsigned ScratchReg = MI->getOperand(1).getReg(); + Register SrcReg = MI->getOperand(0).getReg(); + Register ScratchReg = MI->getOperand(1).getReg(); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi) .addReg(ScratchReg) @@ -2095,7 +2095,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // ldr.w sp, [$src, #8] // ldr.w pc, [$src, #4] - unsigned SrcReg = MI->getOperand(0).getReg(); + Register SrcReg = MI->getOperand(0).getReg(); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12) .addReg(ARM::R11) diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 222aa85856a2..684cd1def977 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -172,9 +172,9 @@ MachineInstr *ARMBaseInstrInfo::convertToThreeAddress( const MachineOperand &WB = isLoad ? 
MI.getOperand(1) : MI.getOperand(0); const MachineOperand &Base = MI.getOperand(2); const MachineOperand &Offset = MI.getOperand(NumOps - 3); - unsigned WBReg = WB.getReg(); - unsigned BaseReg = Base.getReg(); - unsigned OffReg = Offset.getReg(); + Register WBReg = WB.getReg(); + Register BaseReg = Base.getReg(); + Register OffReg = Offset.getReg(); unsigned OffImm = MI.getOperand(NumOps - 2).getImm(); ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI.getOperand(NumOps - 1).getImm(); switch (AddrMode) { @@ -276,8 +276,8 @@ MachineInstr *ARMBaseInstrInfo::convertToThreeAddress( if (LV) { for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); - if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) { - unsigned Reg = MO.getReg(); + if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) { + Register Reg = MO.getReg(); LiveVariables::VarInfo &VI = LV->getVarInfo(Reg); if (MO.isDef()) { @@ -966,8 +966,8 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, SmallSet<unsigned, 4> DstRegs; #endif for (unsigned i = 0; i != SubRegs; ++i) { - unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing); - unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing); + Register Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing); + Register Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing); assert(Dst && Src && "Bad sub-register"); #ifndef NDEBUG assert(!DstRegs.count(Src) && "destructive vector copy"); @@ -1019,7 +1019,7 @@ ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg, if (!SubIdx) return MIB.addReg(Reg, State); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); return MIB.addReg(Reg, State, SubIdx); } @@ -1133,7 +1133,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, case 24: if (ARM::DTripleRegClass.hasSubClassEq(RC)) { // Use aligned spills if the stack can be realigned. - if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + if (Align >= 16 && getRegisterInfo().canRealignStack(MF) && + Subtarget.hasNEON()) { BuildMI(MBB, I, DebugLoc(), get(ARM::VST1d64TPseudo)) .addFrameIndex(FI) .addImm(16) @@ -1155,7 +1156,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, break; case 32: if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) { - if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + if (Align >= 16 && getRegisterInfo().canRealignStack(MF) && + Subtarget.hasNEON()) { // FIXME: It's possible to only store part of the QQ register if the // spilled def has a sub-register index. 
BuildMI(MBB, I, DebugLoc(), get(ARM::VST1d64QPseudo)) @@ -1337,7 +1339,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); } - if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + if (Register::isPhysicalRegister(DestReg)) MIB.addReg(DestReg, RegState::ImplicitDefine); } else llvm_unreachable("Unknown reg class!"); @@ -1368,7 +1370,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, break; case 24: if (ARM::DTripleRegClass.hasSubClassEq(RC)) { - if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + if (Align >= 16 && getRegisterInfo().canRealignStack(MF) && + Subtarget.hasNEON()) { BuildMI(MBB, I, DL, get(ARM::VLD1d64TPseudo), DestReg) .addFrameIndex(FI) .addImm(16) @@ -1382,7 +1385,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); - if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + if (Register::isPhysicalRegister(DestReg)) MIB.addReg(DestReg, RegState::ImplicitDefine); } } else @@ -1390,7 +1393,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, break; case 32: if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) { - if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + if (Align >= 16 && getRegisterInfo().canRealignStack(MF) && + Subtarget.hasNEON()) { BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg) .addFrameIndex(FI) .addImm(16) @@ -1405,7 +1409,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI); - if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + if (Register::isPhysicalRegister(DestReg)) MIB.addReg(DestReg, RegState::ImplicitDefine); } } else @@ -1425,7 +1429,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead, TRI); - if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + if (Register::isPhysicalRegister(DestReg)) MIB.addReg(DestReg, RegState::ImplicitDefine); } else llvm_unreachable("Unknown reg class!"); @@ -1583,8 +1587,8 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { // Look for a copy between even S-registers. That is where we keep floats // when using NEON v2f32 instructions for f32 arithmetic. 
- unsigned DstRegS = MI.getOperand(0).getReg(); - unsigned SrcRegS = MI.getOperand(1).getReg(); + Register DstRegS = MI.getOperand(0).getReg(); + Register SrcRegS = MI.getOperand(1).getReg(); if (!ARM::SPRRegClass.contains(DstRegS, SrcRegS)) return false; @@ -1794,12 +1798,11 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr &MI0, if (MI0.getNumOperands() != MI1.getNumOperands()) return false; - unsigned Addr0 = MI0.getOperand(1).getReg(); - unsigned Addr1 = MI1.getOperand(1).getReg(); + Register Addr0 = MI0.getOperand(1).getReg(); + Register Addr1 = MI1.getOperand(1).getReg(); if (Addr0 != Addr1) { - if (!MRI || - !TargetRegisterInfo::isVirtualRegister(Addr0) || - !TargetRegisterInfo::isVirtualRegister(Addr1)) + if (!MRI || !Register::isVirtualRegister(Addr0) || + !Register::isVirtualRegister(Addr1)) return false; // This assumes SSA form. @@ -2076,6 +2079,38 @@ isProfitableToIfCvt(MachineBasicBlock &TBB, return PredCost <= UnpredCost; } +unsigned +ARMBaseInstrInfo::extraSizeToPredicateInstructions(const MachineFunction &MF, + unsigned NumInsts) const { + // Thumb2 needs a 2-byte IT instruction to predicate up to 4 instructions. + // ARM has a condition code field in every predicable instruction, using it + // doesn't change code size. + return Subtarget.isThumb2() ? divideCeil(NumInsts, 4) * 2 : 0; +} + +unsigned +ARMBaseInstrInfo::predictBranchSizeForIfCvt(MachineInstr &MI) const { + // If this branch is likely to be folded into the comparison to form a + // CB(N)Z, then removing it won't reduce code size at all, because that will + // just replace the CB(N)Z with a CMP. + if (MI.getOpcode() == ARM::t2Bcc && + findCMPToFoldIntoCBZ(&MI, &getRegisterInfo())) + return 0; + + unsigned Size = getInstSizeInBytes(MI); + + // For Thumb2, all branches are 32-bit instructions during the if conversion + // pass, but may be replaced with 16-bit instructions during size reduction. + // Since the branches considered by if conversion tend to be forward branches + // over small basic blocks, they are very likely to be in range for the + // narrow instructions, so we assume the final code size will be half what it + // currently is. + if (Subtarget.isThumb2()) + Size /= 2; + + return Size; +} + bool ARMBaseInstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB, MachineBasicBlock &FMBB) const { @@ -2141,7 +2176,7 @@ MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI, MachineInstr * ARMBaseInstrInfo::canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI, const TargetInstrInfo *TII) const { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return nullptr; if (!MRI.hasOneNonDBGUse(Reg)) return nullptr; @@ -2163,7 +2198,7 @@ ARMBaseInstrInfo::canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI, // MI can't have any tied operands, that would conflict with predication. if (MO.isTied()) return nullptr; - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + if (Register::isPhysicalRegister(MO.getReg())) return nullptr; if (MO.isDef() && !MO.isDead()) return nullptr; @@ -2211,7 +2246,7 @@ ARMBaseInstrInfo::optimizeSelect(MachineInstr &MI, // Find new register class to use. MachineOperand FalseReg = MI.getOperand(Invert ? 
2 : 1); - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg()); if (!MRI.constrainRegClass(DestReg, PreviousClass)) return nullptr; @@ -2298,6 +2333,7 @@ static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = { {ARM::tSUBSrr, ARM::tSUBrr}, {ARM::tSBCS, ARM::tSBC}, {ARM::tRSBS, ARM::tRSB}, + {ARM::tLSLSri, ARM::tLSLri}, {ARM::t2ADDSri, ARM::t2ADDri}, {ARM::t2ADDSrr, ARM::t2ADDrr}, @@ -2420,7 +2456,8 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, MachineOperand &MO = MI->getOperand(i); RegList.push_back(MO); - if (MO.isReg() && TRI->getEncodingValue(MO.getReg()) < FirstRegEnc) + if (MO.isReg() && !MO.isImplicit() && + TRI->getEncodingValue(MO.getReg()) < FirstRegEnc) FirstRegEnc = TRI->getEncodingValue(MO.getReg()); } @@ -2430,7 +2467,7 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, for (int CurRegEnc = FirstRegEnc - 1; CurRegEnc >= 0 && RegsNeeded; --CurRegEnc) { unsigned CurReg = RegClass->getRegister(CurRegEnc); - if (IsT1PushPop && CurReg > ARM::R7) + if (IsT1PushPop && CurRegEnc > TRI->getEncodingValue(ARM::R7)) continue; if (!IsPop) { // Pushing any register is completely harmless, mark the register involved @@ -3039,18 +3076,22 @@ bool ARMBaseInstrInfo::optimizeCompareInstr( break; case ARM::VSELEQD: case ARM::VSELEQS: + case ARM::VSELEQH: CC = ARMCC::EQ; break; case ARM::VSELGTD: case ARM::VSELGTS: + case ARM::VSELGTH: CC = ARMCC::GT; break; case ARM::VSELGED: case ARM::VSELGES: + case ARM::VSELGEH: CC = ARMCC::GE; break; - case ARM::VSELVSS: case ARM::VSELVSD: + case ARM::VSELVSS: + case ARM::VSELVSH: CC = ARMCC::VS; break; } @@ -3271,9 +3312,9 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, } unsigned OpIdx = Commute ? 2 : 1; - unsigned Reg1 = UseMI.getOperand(OpIdx).getReg(); + Register Reg1 = UseMI.getOperand(OpIdx).getReg(); bool isKill = UseMI.getOperand(OpIdx).isKill(); - unsigned NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg)); + Register NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg)); BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(NewUseOpc), NewReg) .addReg(Reg1, getKillRegState(isKill)) @@ -3335,15 +3376,15 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, case ARM::LDRSB_POST: case ARM::LDRSH_POST: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rm = MI.getOperand(3).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rm = MI.getOperand(3).getReg(); return (Rt == Rm) ? 
4 : 3; } case ARM::LDR_PRE_REG: case ARM::LDRB_PRE_REG: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rm = MI.getOperand(3).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rm = MI.getOperand(3).getReg(); if (Rt == Rm) return 3; unsigned ShOpVal = MI.getOperand(4).getImm(); @@ -3372,8 +3413,8 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, case ARM::LDRH_PRE: case ARM::STRH_PRE: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rm = MI.getOperand(3).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rm = MI.getOperand(3).getReg(); if (!Rm) return 2; if (Rt == Rm) @@ -3384,8 +3425,8 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, case ARM::LDR_POST_REG: case ARM::LDRB_POST_REG: case ARM::LDRH_POST: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rm = MI.getOperand(3).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rm = MI.getOperand(3).getReg(); return (Rt == Rm) ? 3 : 2; } @@ -3404,10 +3445,10 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, case ARM::LDRSB_PRE: case ARM::LDRSH_PRE: { - unsigned Rm = MI.getOperand(3).getReg(); + Register Rm = MI.getOperand(3).getReg(); if (Rm == 0) return 3; - unsigned Rt = MI.getOperand(0).getReg(); + Register Rt = MI.getOperand(0).getReg(); if (Rt == Rm) return 4; unsigned ShOpVal = MI.getOperand(4).getImm(); @@ -3422,9 +3463,9 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, } case ARM::LDRD: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rn = MI.getOperand(2).getReg(); - unsigned Rm = MI.getOperand(3).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rn = MI.getOperand(2).getReg(); + Register Rm = MI.getOperand(3).getReg(); if (Rm) return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 4 : 3; @@ -3432,7 +3473,7 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, } case ARM::STRD: { - unsigned Rm = MI.getOperand(3).getReg(); + Register Rm = MI.getOperand(3).getReg(); if (Rm) return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 4 : 3; @@ -3448,9 +3489,9 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, return 4; case ARM::LDRD_PRE: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rn = MI.getOperand(3).getReg(); - unsigned Rm = MI.getOperand(4).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rn = MI.getOperand(3).getReg(); + Register Rm = MI.getOperand(4).getReg(); if (Rm) return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5 : 4; @@ -3458,13 +3499,13 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, } case ARM::t2LDRD_PRE: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rn = MI.getOperand(3).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rn = MI.getOperand(3).getReg(); return (Rt == Rn) ? 4 : 3; } case ARM::STRD_PRE: { - unsigned Rm = MI.getOperand(4).getReg(); + Register Rm = MI.getOperand(4).getReg(); if (Rm) return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5 : 4; @@ -3495,8 +3536,8 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, return 2; case ARM::t2LDRDi8: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rn = MI.getOperand(2).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rn = MI.getOperand(2).getReg(); return (Rt == Rn) ? 
3 : 2; } @@ -3745,7 +3786,7 @@ ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData, } bool ARMBaseInstrInfo::isLDMBaseRegInList(const MachineInstr &MI) const { - unsigned BaseReg = MI.getOperand(0).getReg(); + Register BaseReg = MI.getOperand(0).getReg(); for (unsigned i = 1, sz = MI.getNumOperands(); i < sz; ++i) { const auto &Op = MI.getOperand(i); if (Op.isReg() && Op.getReg() == BaseReg) @@ -4219,7 +4260,7 @@ int ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, return -1; const MachineOperand &DefMO = DefMI.getOperand(DefIdx); - unsigned Reg = DefMO.getReg(); + Register Reg = DefMO.getReg(); const MachineInstr *ResolvedDefMI = &DefMI; unsigned DefAdj = 0; @@ -4328,10 +4369,10 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, } const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode()); - const MachineSDNode *DefMN = dyn_cast<MachineSDNode>(DefNode); + auto *DefMN = cast<MachineSDNode>(DefNode); unsigned DefAlign = !DefMN->memoperands_empty() ? (*DefMN->memoperands_begin())->getAlignment() : 0; - const MachineSDNode *UseMN = dyn_cast<MachineSDNode>(UseNode); + auto *UseMN = cast<MachineSDNode>(UseNode); unsigned UseAlign = !UseMN->memoperands_empty() ? (*UseMN->memoperands_begin())->getAlignment() : 0; int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign, @@ -4708,7 +4749,7 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI, if (MI.getOperand(i).isImplicit() || !MI.getOperand(i).isReg()) continue; - unsigned Reg = MI.getOperand(i).getReg(); + Register Reg = MI.getOperand(i).getReg(); if (Reg < ARM::R0 || Reg > ARM::R7) { if (!(MI.getOpcode() == ARM::tPUSH && Reg == ARM::LR) && !(MI.getOpcode() == ARM::tPOP_RET && Reg == ARM::PC)) { @@ -4731,7 +4772,7 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI, MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); - unsigned Reg = MI->getOperand(0).getReg(); + Register Reg = MI->getOperand(0).getReg(); const GlobalValue *GV = cast<GlobalValue>((*MI->memoperands_begin())->getValue()); MachineInstrBuilder MIB; @@ -5104,7 +5145,7 @@ unsigned ARMBaseInstrInfo::getPartialRegUpdateClearance( const MachineOperand &MO = MI.getOperand(OpNum); if (MO.readsReg()) return 0; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); int UseOp = -1; switch (MI.getOpcode()) { @@ -5134,7 +5175,7 @@ unsigned ARMBaseInstrInfo::getPartialRegUpdateClearance( return 0; // We must be able to clobber the whole D-reg. - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { // Virtual register must be a def undef foo:ssub_0 operand. if (!MO.getSubReg() || MI.readsVirtualRegister(Reg)) return 0; @@ -5159,8 +5200,8 @@ void ARMBaseInstrInfo::breakPartialRegDependency( assert(TRI && "Need TRI instance"); const MachineOperand &MO = MI.getOperand(OpNum); - unsigned Reg = MO.getReg(); - assert(TargetRegisterInfo::isPhysicalRegister(Reg) && + Register Reg = MO.getReg(); + assert(Register::isPhysicalRegister(Reg) && "Can't break virtual register dependencies."); unsigned DReg = Reg; @@ -5337,7 +5378,7 @@ MachineInstr *llvm::findCMPToFoldIntoCBZ(MachineInstr *Br, // is not redefined between the cmp and the br. 
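  // As an illustration (an assumed example, not from the original source),
  // the shape being matched is a zero-compare feeding a conditional branch:
  //   cmp r3, #0
  //   beq .LBB0_2
  // which can then be folded into the single instruction:
  //   cbz r3, .LBB0_2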
if (CmpMI->getOpcode() != ARM::tCMPi8 && CmpMI->getOpcode() != ARM::t2CMPri) return nullptr; - unsigned Reg = CmpMI->getOperand(0).getReg(); + Register Reg = CmpMI->getOperand(0).getReg(); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(*CmpMI, PredReg); if (Pred != ARMCC::AL || CmpMI->getOperand(1).getImm() != 0) @@ -5349,3 +5390,50 @@ MachineInstr *llvm::findCMPToFoldIntoCBZ(MachineInstr *Br, return &*CmpMI; } + +unsigned llvm::ConstantMaterializationCost(unsigned Val, + const ARMSubtarget *Subtarget, + bool ForCodesize) { + if (Subtarget->isThumb()) { + if (Val <= 255) // MOV + return ForCodesize ? 2 : 1; + if (Subtarget->hasV6T2Ops() && (Val <= 0xffff || // MOV + ARM_AM::getT2SOImmVal(Val) != -1 || // MOVW + ARM_AM::getT2SOImmVal(~Val) != -1)) // MVN + return ForCodesize ? 4 : 1; + if (Val <= 510) // MOV + ADDi8 + return ForCodesize ? 4 : 2; + if (~Val <= 255) // MOV + MVN + return ForCodesize ? 4 : 2; + if (ARM_AM::isThumbImmShiftedVal(Val)) // MOV + LSL + return ForCodesize ? 4 : 2; + } else { + if (ARM_AM::getSOImmVal(Val) != -1) // MOV + return ForCodesize ? 4 : 1; + if (ARM_AM::getSOImmVal(~Val) != -1) // MVN + return ForCodesize ? 4 : 1; + if (Subtarget->hasV6T2Ops() && Val <= 0xffff) // MOVW + return ForCodesize ? 4 : 1; + if (ARM_AM::isSOImmTwoPartVal(Val)) // two instrs + return ForCodesize ? 8 : 2; + } + if (Subtarget->useMovt()) // MOVW + MOVT + return ForCodesize ? 8 : 2; + return ForCodesize ? 8 : 3; // Literal pool load +} + +bool llvm::HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, + const ARMSubtarget *Subtarget, + bool ForCodesize) { + // Check with ForCodesize + unsigned Cost1 = ConstantMaterializationCost(Val1, Subtarget, ForCodesize); + unsigned Cost2 = ConstantMaterializationCost(Val2, Subtarget, ForCodesize); + if (Cost1 < Cost2) + return true; + if (Cost1 > Cost2) + return false; + + // If they are equal, try with !ForCodesize + return ConstantMaterializationCost(Val1, Subtarget, !ForCodesize) < + ConstantMaterializationCost(Val2, Subtarget, !ForCodesize); +} diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index c28983fcc15c..c232b6f0b45d 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -276,6 +276,10 @@ public: return NumCycles == 1; } + unsigned extraSizeToPredicateInstructions(const MachineFunction &MF, + unsigned NumInsts) const override; + unsigned predictBranchSizeForIfCvt(MachineInstr &MI) const override; + bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, MachineBasicBlock &FMBB) const override; @@ -601,7 +605,8 @@ bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, int &Offset, - const ARMBaseInstrInfo &TII); + const ARMBaseInstrInfo &TII, + const TargetRegisterInfo *TRI); /// Return true if Reg is defd between From and To bool registerDefinedBetween(unsigned Reg, MachineBasicBlock::iterator From, @@ -620,6 +625,20 @@ void addPredicatedMveVpredNOp(MachineInstrBuilder &MIB, unsigned Cond); void addPredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned Cond, unsigned Inactive); +/// Returns the number of instructions required to materialize the given +/// constant in a register, or 3 if a literal pool load is needed. +/// If ForCodesize is specified, an approximate cost in bytes is returned. 
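+/// For example, on a Thumb2 target with MOVW/MOVT available, Val = 0x12345678
+/// matches no single-instruction pattern, so the estimate is 2 instructions
+/// (MOVW + MOVT), or 8 bytes when ForCodesize is set.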
+unsigned ConstantMaterializationCost(unsigned Val, + const ARMSubtarget *Subtarget, + bool ForCodesize = false); + +/// Returns true if Val1 has a lower Constant Materialization Cost than Val2. +/// Uses the cost from ConstantMaterializationCost, first with ForCodesize as +/// specified. If the scores are equal, return the comparison for !ForCodesize. +bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, + const ARMSubtarget *Subtarget, + bool ForCodesize = false); + } // end namespace llvm #endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index dc99b37742da..1eaf871867e0 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -174,6 +174,12 @@ ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, : CSR_AAPCS_ThisReturn_RegMask; } +ArrayRef<MCPhysReg> ARMBaseRegisterInfo::getIntraCallClobberedRegs( + const MachineFunction *MF) const { + static const MCPhysReg IntraCallClobberedRegs[] = {ARM::R12}; + return ArrayRef<MCPhysReg>(IntraCallClobberedRegs); +} + BitVector ARMBaseRegisterInfo:: getReservedRegs(const MachineFunction &MF) const { const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); @@ -185,7 +191,7 @@ getReservedRegs(const MachineFunction &MF) const { markSuperRegs(Reserved, ARM::PC); markSuperRegs(Reserved, ARM::FPSCR); markSuperRegs(Reserved, ARM::APSR_NZCV); - if (TFI->hasFP(MF)) + if (TFI->hasFP(MF) || STI.isTargetDarwin()) markSuperRegs(Reserved, getFramePointerReg(STI)); if (hasBasePointer(MF)) markSuperRegs(Reserved, BasePtr); @@ -217,7 +223,7 @@ isAsmClobberable(const MachineFunction &MF, unsigned PhysReg) const { const TargetRegisterClass * ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, - const MachineFunction &) const { + const MachineFunction &MF) const { const TargetRegisterClass *Super = RC; TargetRegisterClass::sc_iterator I = RC->getSuperClasses(); do { @@ -225,11 +231,13 @@ ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, case ARM::GPRRegClassID: case ARM::SPRRegClassID: case ARM::DPRRegClassID: + case ARM::GPRPairRegClassID: + return Super; case ARM::QPRRegClassID: case ARM::QQPRRegClassID: case ARM::QQQQPRRegClassID: - case ARM::GPRPairRegClassID: - return Super; + if (MF.getSubtarget<ARMSubtarget>().hasNEON()) + return Super; } Super = *I++; } while (Super); @@ -317,7 +325,7 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg, return false; unsigned PairedPhys = 0; - if (TargetRegisterInfo::isPhysicalRegister(Paired)) { + if (Register::isPhysicalRegister(Paired)) { PairedPhys = Paired; } else if (VRM && VRM->hasPhys(Paired)) { PairedPhys = getPairedGPR(VRM->getPhys(Paired), Odd, this); @@ -347,7 +355,7 @@ ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg, std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg); if ((Hint.first == (unsigned)ARMRI::RegPairOdd || Hint.first == (unsigned)ARMRI::RegPairEven) && - TargetRegisterInfo::isVirtualRegister(Hint.second)) { + Register::isVirtualRegister(Hint.second)) { // If 'Reg' is one of the even / odd register pair and it's now changed // (e.g. coalesced) into a different register. The other register of the // pair allocation hint must be updated to reflect the relationship @@ -357,7 +365,7 @@ ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg, // Make sure the pair has not already divorced. 
if (Hint.second == Reg) { MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg); - if (TargetRegisterInfo::isVirtualRegister(NewReg)) + if (Register::isVirtualRegister(NewReg)) MRI->setRegAllocationHint(NewReg, Hint.first == (unsigned)ARMRI::RegPairOdd ? ARMRI::RegPairEven : ARMRI::RegPairOdd, OtherReg); @@ -663,7 +671,7 @@ void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, Done = rewriteARMFrameIndex(MI, i, BaseReg, Off, TII); else { assert(AFI->isThumb2Function()); - Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII); + Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII, this); } assert(Done && "Unable to resolve frame index!"); (void)Done; @@ -775,7 +783,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Done = rewriteARMFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII); else { assert(AFI->isThumb2Function()); - Done = rewriteT2FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII); + Done = rewriteT2FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII, this); } if (Done) return; @@ -783,21 +791,32 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // If we get here, the immediate doesn't fit into the instruction. We folded // as much as possible above, handle the rest, providing a register that is // SP+LargeImm. - assert((Offset || - (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 || - (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6) && - "This code isn't needed if offset already handled!"); + assert( + (Offset || + (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 || + (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6 || + (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrModeT2_i7 || + (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrModeT2_i7s2 || + (MI.getDesc().TSFlags & ARMII::AddrModeMask) == + ARMII::AddrModeT2_i7s4) && + "This code isn't needed if offset already handled!"); unsigned ScratchReg = 0; int PIdx = MI.findFirstPredOperandIdx(); ARMCC::CondCodes Pred = (PIdx == -1) ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm(); Register PredReg = (PIdx == -1) ? Register() : MI.getOperand(PIdx+1).getReg(); - if (Offset == 0) + + const MCInstrDesc &MCID = MI.getDesc(); + const TargetRegisterClass *RegClass = + TII.getRegClass(MCID, FIOperandNum, this, *MI.getParent()->getParent()); + + if (Offset == 0 && + (Register::isVirtualRegister(FrameReg) || RegClass->contains(FrameReg))) // Must be addrmode4/6. 
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, false); else { - ScratchReg = MF.getRegInfo().createVirtualRegister(&ARM::GPRRegClass); + ScratchReg = MF.getRegInfo().createVirtualRegister(RegClass); if (!AFI->isThumbFunction()) emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, Pred, PredReg, TII); diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index 7e2c72b4d712..477f3ad0a9a7 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -129,6 +129,9 @@ public: const uint32_t *getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const; + ArrayRef<MCPhysReg> + getIntraCallClobberedRegs(const MachineFunction *MF) const override; + BitVector getReservedRegs(const MachineFunction &MF) const override; bool isAsmClobberable(const MachineFunction &MF, unsigned PhysReg) const override; @@ -176,8 +179,6 @@ public: Register getFrameRegister(const MachineFunction &MF) const override; unsigned getBaseRegister() const { return BasePtr; } - bool isLowRegister(unsigned Reg) const; - /// emitLoadConstPool - Emits a load from constpool to materialize the /// specified immediate. diff --git a/lib/Target/ARM/ARMBasicBlockInfo.cpp b/lib/Target/ARM/ARMBasicBlockInfo.cpp index 2de90e816b33..00a2231f59e3 100644 --- a/lib/Target/ARM/ARMBasicBlockInfo.cpp +++ b/lib/Target/ARM/ARMBasicBlockInfo.cpp @@ -6,14 +6,16 @@ // //===----------------------------------------------------------------------===// +#include "ARMBasicBlockInfo.h" #include "ARM.h" #include "ARMBaseInstrInfo.h" -#include "ARMBasicBlockInfo.h" #include "ARMMachineFunctionInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/Support/Debug.h" #include <vector> #define DEBUG_TYPE "arm-bb-utils" @@ -47,7 +49,7 @@ void ARMBasicBlockUtils::computeBlockSize(MachineBasicBlock *MBB) { BasicBlockInfo &BBI = BBInfo[MBB->getNumber()]; BBI.Size = 0; BBI.Unalign = 0; - BBI.PostAlign = 0; + BBI.PostAlign = Align::None(); for (MachineInstr &I : *MBB) { BBI.Size += TII->getInstSizeInBytes(I); @@ -62,8 +64,8 @@ void ARMBasicBlockUtils::computeBlockSize(MachineBasicBlock *MBB) { // tBR_JTr contains a .align 2 directive. if (!MBB->empty() && MBB->back().getOpcode() == ARM::tBR_JTr) { - BBI.PostAlign = 2; - MBB->getParent()->ensureAlignment(2); + BBI.PostAlign = Align(4); + MBB->getParent()->ensureAlignment(Align(4)); } } @@ -126,9 +128,9 @@ void ARMBasicBlockUtils::adjustBBOffsetsAfter(MachineBasicBlock *BB) { for(unsigned i = BBNum + 1, e = MF.getNumBlockIDs(); i < e; ++i) { // Get the offset and known bits at the end of the layout predecessor. // Include the alignment of the current block. - unsigned LogAlign = MF.getBlockNumbered(i)->getAlignment(); - unsigned Offset = BBInfo[i - 1].postOffset(LogAlign); - unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign); + const Align Align = MF.getBlockNumbered(i)->getAlignment(); + const unsigned Offset = BBInfo[i - 1].postOffset(Align); + const unsigned KnownBits = BBInfo[i - 1].postKnownBits(Align); // This is where block i begins. Stop if the offset is already correct, // and we have updated 2 blocks. 
This is the maximum number of blocks
diff --git a/lib/Target/ARM/ARMBasicBlockInfo.h b/lib/Target/ARM/ARMBasicBlockInfo.h
index 400bba351cec..13df399ed995 100644
--- a/lib/Target/ARM/ARMBasicBlockInfo.h
+++ b/lib/Target/ARM/ARMBasicBlockInfo.h
@@ -21,17 +21,18 @@
 namespace llvm {

+struct BasicBlockInfo;
 using BBInfoVector = SmallVectorImpl<BasicBlockInfo>;

 /// UnknownPadding - Return the worst case padding that could result from
 /// unknown offset bits.  This does not include alignment padding caused by
 /// known offset bits.
 ///
-/// @param LogAlign log2(alignment)
+/// @param Alignment alignment in bytes
 /// @param KnownBits Number of known low offset bits.
-inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) {
-  if (KnownBits < LogAlign)
-    return (1u << LogAlign) - (1u << KnownBits);
+inline unsigned UnknownPadding(Align Alignment, unsigned KnownBits) {
+  if (KnownBits < Log2(Alignment))
+    return Alignment.value() - (1ull << KnownBits);
   return 0;
 }

@@ -65,10 +66,9 @@ struct BasicBlockInfo {
   /// multiple of 1 << Unalign.
   uint8_t Unalign = 0;

-  /// PostAlign - When non-zero, the block terminator contains a .align
-  /// directive, so the end of the block is aligned to 1 << PostAlign
-  /// bytes.
-  uint8_t PostAlign = 0;
+  /// PostAlign - When > 1, the block terminator contains a .align
+  /// directive, so the end of the block is aligned to PostAlign bytes.
+  Align PostAlign;

   BasicBlockInfo() = default;

@@ -84,16 +84,16 @@ struct BasicBlockInfo {
     return Bits;
   }

-  /// Compute the offset immediately following this block.  If LogAlign is
+  /// Compute the offset immediately following this block.  If Align is
   /// specified, return the offset the successor block will get if it has
   /// this alignment.
-  unsigned postOffset(unsigned LogAlign = 0) const {
+  unsigned postOffset(Align Alignment = Align::None()) const {
     unsigned PO = Offset + Size;
-    unsigned LA = std::max(unsigned(PostAlign), LogAlign);
-    if (!LA)
+    const Align PA = std::max(PostAlign, Alignment);
+    if (PA == Align::None())
       return PO;
     // Add alignment padding from the terminator.
-    return PO + UnknownPadding(LA, internalKnownBits());
+    return PO + UnknownPadding(PA, internalKnownBits());
   }

   /// Compute the number of known low bits of postOffset.  If this block
   /// instruction alignment.  An aligned terminator may increase the number
   /// of known bits.
   /// If Align is given, also consider the alignment of the next block.
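   /// For example (an illustrative case), a PostAlign of Align(4) means the
   /// two low bits of postOffset are known to be zero, so postKnownBits()
   /// returns at least 2.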
- unsigned postKnownBits(unsigned LogAlign = 0) const { - return std::max(std::max(unsigned(PostAlign), LogAlign), - internalKnownBits()); + unsigned postKnownBits(Align Align = Align::None()) const { + return std::max(Log2(std::max(PostAlign, Align)), internalKnownBits()); } }; diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp index 0cbe6e1871e4..d3b595ce8323 100644 --- a/lib/Target/ARM/ARMCallLowering.cpp +++ b/lib/Target/ARM/ARMCallLowering.cpp @@ -90,6 +90,8 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { MachineInstrBuilder &MIB, CCAssignFn *AssignFn) : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} + bool isIncomingArgumentHandler() const override { return false; } + Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) && @@ -169,8 +171,9 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - const CallLowering::ArgInfo &Info, CCState &State) override { - if (AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State)) + const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, + CCState &State) override { + if (AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State)) return true; StackSize = @@ -199,9 +202,8 @@ void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg, if (SplitVTs.size() == 1) { // Even if there is no splitting to do, we still want to replace the // original type (e.g. pointer type -> integer). - auto Flags = OrigArg.Flags; - unsigned OriginalAlignment = DL.getABITypeAlignment(OrigArg.Ty); - Flags.setOrigAlign(OriginalAlignment); + auto Flags = OrigArg.Flags[0]; + Flags.setOrigAlign(Align(DL.getABITypeAlignment(OrigArg.Ty))); SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx), Flags, OrigArg.IsFixed); return; @@ -211,10 +213,9 @@ void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg, for (unsigned i = 0, e = SplitVTs.size(); i != e; ++i) { EVT SplitVT = SplitVTs[i]; Type *SplitTy = SplitVT.getTypeForEVT(Ctx); - auto Flags = OrigArg.Flags; + auto Flags = OrigArg.Flags[0]; - unsigned OriginalAlignment = DL.getABITypeAlignment(SplitTy); - Flags.setOrigAlign(OriginalAlignment); + Flags.setOrigAlign(Align(DL.getABITypeAlignment(SplitTy))); bool NeedsConsecutiveRegisters = TLI.functionArgumentNeedsConsecutiveRegisters( @@ -286,7 +287,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { CCAssignFn AssignFn) : ValueHandler(MIRBuilder, MRI, AssignFn) {} - bool isArgumentHandler() const override { return true; } + bool isIncomingArgumentHandler() const override { return true; } Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { @@ -298,7 +299,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { int FI = MFI.CreateFixedObject(Size, Offset, true); MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); - unsigned AddrReg = + Register AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(MPO.getAddrSpace(), 32)); MIRBuilder.buildFrameIndex(AddrReg, FI); @@ -405,6 +406,7 @@ struct FormalArgHandler : public IncomingValueHandler { : IncomingValueHandler(MIRBuilder, MRI, AssignFn) {} void markPhysRegUsed(unsigned PhysReg) override { + MIRBuilder.getMRI()->addLiveIn(PhysReg); MIRBuilder.getMBB().addLiveIn(PhysReg); } }; @@ -498,11 +500,7 @@ unsigned getCallOpcode(const ARMSubtarget &STI, bool isDirect) { } } // end 
anonymous namespace -bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, - CallingConv::ID CallConv, - const MachineOperand &Callee, - const ArgInfo &OrigRet, - ArrayRef<ArgInfo> OrigArgs) const { +bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const { MachineFunction &MF = MIRBuilder.getMF(); const auto &TLI = *getTLI<ARMTargetLowering>(); const auto &DL = MF.getDataLayout(); @@ -520,7 +518,7 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Create the call instruction so we can add the implicit uses of arg // registers, but don't insert it yet. - bool IsDirect = !Callee.isReg(); + bool IsDirect = !Info.Callee.isReg(); auto CallOpcode = getCallOpcode(STI, IsDirect); auto MIB = MIRBuilder.buildInstrNoInsert(CallOpcode); @@ -528,35 +526,35 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, if (IsThumb) MIB.add(predOps(ARMCC::AL)); - MIB.add(Callee); + MIB.add(Info.Callee); if (!IsDirect) { - auto CalleeReg = Callee.getReg(); - if (CalleeReg && !TRI->isPhysicalRegister(CalleeReg)) { + auto CalleeReg = Info.Callee.getReg(); + if (CalleeReg && !Register::isPhysicalRegister(CalleeReg)) { unsigned CalleeIdx = IsThumb ? 2 : 0; MIB->getOperand(CalleeIdx).setReg(constrainOperandRegClass( MF, *TRI, MRI, *STI.getInstrInfo(), *STI.getRegBankInfo(), - *MIB.getInstr(), MIB->getDesc(), Callee, CalleeIdx)); + *MIB.getInstr(), MIB->getDesc(), Info.Callee, CalleeIdx)); } } - MIB.addRegMask(TRI->getCallPreservedMask(MF, CallConv)); + MIB.addRegMask(TRI->getCallPreservedMask(MF, Info.CallConv)); bool IsVarArg = false; SmallVector<ArgInfo, 8> ArgInfos; - for (auto Arg : OrigArgs) { + for (auto Arg : Info.OrigArgs) { if (!isSupportedType(DL, TLI, Arg.Ty)) return false; if (!Arg.IsFixed) IsVarArg = true; - if (Arg.Flags.isByVal()) + if (Arg.Flags[0].isByVal()) return false; splitToValueTypes(Arg, ArgInfos, MF); } - auto ArgAssignFn = TLI.CCAssignFnForCall(CallConv, IsVarArg); + auto ArgAssignFn = TLI.CCAssignFnForCall(Info.CallConv, IsVarArg); OutgoingValueHandler ArgHandler(MIRBuilder, MRI, MIB, ArgAssignFn); if (!handleAssignments(MIRBuilder, ArgInfos, ArgHandler)) return false; @@ -564,13 +562,13 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Now we can add the actual call instruction to the correct basic block. 
MIRBuilder.insertInstr(MIB); - if (!OrigRet.Ty->isVoidTy()) { - if (!isSupportedType(DL, TLI, OrigRet.Ty)) + if (!Info.OrigRet.Ty->isVoidTy()) { + if (!isSupportedType(DL, TLI, Info.OrigRet.Ty)) return false; ArgInfos.clear(); - splitToValueTypes(OrigRet, ArgInfos, MF); - auto RetAssignFn = TLI.CCAssignFnForReturn(CallConv, IsVarArg); + splitToValueTypes(Info.OrigRet, ArgInfos, MF); + auto RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv, IsVarArg); CallReturnHandler RetHandler(MIRBuilder, MRI, MIB, RetAssignFn); if (!handleAssignments(MIRBuilder, ArgInfos, RetHandler)) return false; diff --git a/lib/Target/ARM/ARMCallLowering.h b/lib/Target/ARM/ARMCallLowering.h index 794127b5ebc7..ddbc9feb90e2 100644 --- a/lib/Target/ARM/ARMCallLowering.h +++ b/lib/Target/ARM/ARMCallLowering.h @@ -38,9 +38,8 @@ public: bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<ArrayRef<Register>> VRegs) const override; - bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, - const MachineOperand &Callee, const ArgInfo &OrigRet, - ArrayRef<ArgInfo> OrigArgs) const override; + bool lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const override; private: bool lowerReturnVal(MachineIRBuilder &MIRBuilder, const Value *Val, diff --git a/lib/Target/ARM/ARMCallingConv.cpp b/lib/Target/ARM/ARMCallingConv.cpp index 5ede7c67f7c2..92ebc542b423 100644 --- a/lib/Target/ARM/ARMCallingConv.cpp +++ b/lib/Target/ARM/ARMCallingConv.cpp @@ -193,7 +193,7 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT, // Try to allocate a contiguous block of registers, each of the correct // size to hold one member. auto &DL = State.getMachineFunction().getDataLayout(); - unsigned StackAlign = DL.getStackAlignment(); + unsigned StackAlign = DL.getStackAlignment().value(); unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign); ArrayRef<MCPhysReg> RegList; diff --git a/lib/Target/ARM/ARMCodeGenPrepare.cpp b/lib/Target/ARM/ARMCodeGenPrepare.cpp index 2fc5f4aaab50..1c2c8aef55bb 100644 --- a/lib/Target/ARM/ARMCodeGenPrepare.cpp +++ b/lib/Target/ARM/ARMCodeGenPrepare.cpp @@ -179,16 +179,12 @@ public: } static bool GenerateSignBits(Value *V) { - if (auto *Arg = dyn_cast<Argument>(V)) - return Arg->hasSExtAttr(); - if (!isa<Instruction>(V)) return false; unsigned Opc = cast<Instruction>(V)->getOpcode(); return Opc == Instruction::AShr || Opc == Instruction::SDiv || - Opc == Instruction::SRem || Opc == Instruction::SExt || - Opc == Instruction::SIToFP; + Opc == Instruction::SRem || Opc == Instruction::SExt; } static bool EqualTypeSize(Value *V) { @@ -806,54 +802,48 @@ void IRPromoter::Mutate(Type *OrigTy, /// return value is zeroext. We don't allow opcodes that can introduce sign /// bits. bool ARMCodeGenPrepare::isSupportedValue(Value *V) { - if (auto *I = dyn_cast<ICmpInst>(V)) { - // Now that we allow small types than TypeSize, only allow icmp of - // TypeSize because they will require a trunc to be legalised. - // TODO: Allow icmp of smaller types, and calculate at the end - // whether the transform would be beneficial. 
-    if (isa<PointerType>(I->getOperand(0)->getType()))
+  if (auto *I = dyn_cast<Instruction>(V)) {
+    switch (I->getOpcode()) {
+    default:
+      return isa<BinaryOperator>(I) && isSupportedType(I) &&
+             !GenerateSignBits(I);
+    case Instruction::GetElementPtr:
+    case Instruction::Store:
+    case Instruction::Br:
+    case Instruction::Switch:
       return true;
-    return EqualTypeSize(I->getOperand(0));
-  }
-
-  if (GenerateSignBits(V)) {
-    LLVM_DEBUG(dbgs() << "ARM CGP: No, instruction can generate sign bits.\n");
-    return false;
-  }
-
-  // Memory instructions
-  if (isa<StoreInst>(V) || isa<GetElementPtrInst>(V))
-    return true;
-
-  // Branches and targets.
-  if( isa<BranchInst>(V) || isa<SwitchInst>(V) || isa<BasicBlock>(V))
-    return true;
-
-  // Non-instruction values that we can handle.
-  if ((isa<Constant>(V) && !isa<ConstantExpr>(V)) || isa<Argument>(V))
+    case Instruction::PHI:
+    case Instruction::Select:
+    case Instruction::Ret:
+    case Instruction::Load:
+    case Instruction::Trunc:
+    case Instruction::BitCast:
+      return isSupportedType(I);
+    case Instruction::ZExt:
+      return isSupportedType(I->getOperand(0));
+    case Instruction::ICmp:
+      // Now that we allow smaller types than TypeSize, only allow icmp of
+      // TypeSize because they will require a trunc to be legalised.
+      // TODO: Allow icmp of smaller types, and calculate at the end
+      // whether the transform would be beneficial.
+      if (isa<PointerType>(I->getOperand(0)->getType()))
+        return true;
+      return EqualTypeSize(I->getOperand(0));
+    case Instruction::Call: {
+      // Special cases for calls as we need to check for zeroext
+      // TODO We should accept calls even if they don't have zeroext, as they
+      // can still be sinks.
+      auto *Call = cast<CallInst>(I);
+      return isSupportedType(Call) &&
+             Call->hasRetAttr(Attribute::AttrKind::ZExt);
+    }
+    }
+  } else if (isa<Constant>(V) && !isa<ConstantExpr>(V)) {
     return isSupportedType(V);
-
-  if (isa<PHINode>(V) || isa<SelectInst>(V) || isa<ReturnInst>(V) ||
-      isa<LoadInst>(V))
+  } else if (isa<Argument>(V))
     return isSupportedType(V);
-  if (auto *Cast = dyn_cast<CastInst>(V))
-    return isSupportedType(Cast) || isSupportedType(Cast->getOperand(0));
-
-  // Special cases for calls as we need to check for zeroext
-  // TODO We should accept calls even if they don't have zeroext, as they can
-  // still be sinks.
- if (auto *Call = dyn_cast<CallInst>(V)) - return isSupportedType(Call) && - Call->hasRetAttr(Attribute::AttrKind::ZExt); - - if (!isa<BinaryOperator>(V)) - return false; - - if (!isSupportedType(V)) - return false; - - return true; + return isa<BasicBlock>(V); } /// Check that the type of V would be promoted and that the original type is diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index 60e5d7bf6098..24ca25f73e96 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -26,8 +26,10 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -69,6 +71,7 @@ STATISTIC(NumT2BrShrunk, "Number of Thumb2 immediate branches shrunk"); STATISTIC(NumCBZ, "Number of CBZ / CBNZ formed"); STATISTIC(NumJTMoved, "Number of jump table destination blocks moved"); STATISTIC(NumJTInserted, "Number of jump table intermediate blocks inserted"); +STATISTIC(NumLEInserted, "Number of LE backwards branches inserted"); static cl::opt<bool> AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true), @@ -212,6 +215,7 @@ namespace { const ARMBaseInstrInfo *TII; const ARMSubtarget *STI; ARMFunctionInfo *AFI; + MachineDominatorTree *DT = nullptr; bool isThumb; bool isThumb1; bool isThumb2; @@ -224,6 +228,11 @@ namespace { bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoVRegs); @@ -238,7 +247,7 @@ namespace { void doInitialJumpTablePlacement(std::vector<MachineInstr *> &CPEMIs); bool BBHasFallthrough(MachineBasicBlock *MBB); CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI); - unsigned getCPELogAlign(const MachineInstr *CPEMI); + Align getCPEAlign(const MachineInstr *CPEMI); void scanFunctionJumpTables(); void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs); MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI); @@ -327,8 +336,7 @@ LLVM_DUMP_METHOD void ARMConstantIslands::dumpBBs() { const BasicBlockInfo &BBI = BBInfo[J]; dbgs() << format("%08x %bb.%u\t", BBI.Offset, J) << " kb=" << unsigned(BBI.KnownBits) - << " ua=" << unsigned(BBI.Unalign) - << " pa=" << unsigned(BBI.PostAlign) + << " ua=" << unsigned(BBI.Unalign) << " pa=" << Log2(BBI.PostAlign) << format(" size=%#x\n", BBInfo[J].Size); } }); @@ -349,6 +357,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { isPositionIndependentOrROPI = STI->getTargetLowering()->isPositionIndependent() || STI->isROPI(); AFI = MF->getInfo<ARMFunctionInfo>(); + DT = &getAnalysis<MachineDominatorTree>(); isThumb = AFI->isThumbFunction(); isThumb1 = AFI->isThumb1OnlyFunction(); @@ -357,9 +366,6 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { HasFarJump = false; bool GenerateTBB = isThumb2 || (isThumb1 && SynthesizeThumb1TBB); - // This pass invalidates liveness information when it splits basic blocks. 
- MF->getRegInfo().invalidateLiveness(); - // Renumber all of the machine basic blocks in the function, guaranteeing that // the numbers agree with the position of the block in the function. MF->RenumberBlocks(); @@ -398,7 +404,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { // Functions with jump tables need an alignment of 4 because they use the ADR // instruction, which aligns the PC to 4 bytes before adding an offset. if (!T2JumpTables.empty()) - MF->ensureAlignment(2); + MF->ensureAlignment(Align(4)); /// Remove dead constant pool entries. MadeChange |= removeUnusedCPEntries(); @@ -487,8 +493,9 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) MachineBasicBlock *BB = MF->CreateMachineBasicBlock(); MF->push_back(BB); - // MachineConstantPool measures alignment in bytes. We measure in log2(bytes). - unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment()); + // MachineConstantPool measures alignment in bytes. + const Align MaxAlign(MCP->getConstantPoolAlignment()); + const unsigned MaxLogAlign = Log2(MaxAlign); // Mark the basic block as required by the const-pool. BB->setAlignment(MaxAlign); @@ -501,7 +508,8 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) // alignment of all entries as long as BB is sufficiently aligned. Keep // track of the insertion point for each alignment. We are going to bucket // sort the entries as they are created. - SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxAlign + 1, BB->end()); + SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxLogAlign + 1, + BB->end()); // Add all of the constants from the constant pool to the end block, use an // identity mapping of CPI's to CPE's. @@ -526,7 +534,7 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) // Ensure that future entries with higher alignment get inserted before // CPEMI. This is bucket sort with iterators. - for (unsigned a = LogAlign + 1; a <= MaxAlign; ++a) + for (unsigned a = LogAlign + 1; a <= MaxLogAlign; ++a) if (InsPoint[a] == InsAt) InsPoint[a] = CPEMI; @@ -640,29 +648,27 @@ ARMConstantIslands::findConstPoolEntry(unsigned CPI, return nullptr; } -/// getCPELogAlign - Returns the required alignment of the constant pool entry -/// represented by CPEMI. Alignment is measured in log2(bytes) units. -unsigned ARMConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) { +/// getCPEAlign - Returns the required alignment of the constant pool entry +/// represented by CPEMI. +Align ARMConstantIslands::getCPEAlign(const MachineInstr *CPEMI) { switch (CPEMI->getOpcode()) { case ARM::CONSTPOOL_ENTRY: break; case ARM::JUMPTABLE_TBB: - return isThumb1 ? 2 : 0; + return isThumb1 ? Align(4) : Align(1); case ARM::JUMPTABLE_TBH: - return isThumb1 ? 2 : 1; + return isThumb1 ? 
Align(4) : Align(2); case ARM::JUMPTABLE_INSTS: - return 1; + return Align(2); case ARM::JUMPTABLE_ADDRS: - return 2; + return Align(4); default: llvm_unreachable("unknown constpool entry kind"); } unsigned CPI = getCombinedIndex(CPEMI); assert(CPI < MCP->getConstants().size() && "Invalid constant pool index."); - unsigned Align = MCP->getConstants()[CPI].getAlignment(); - assert(isPowerOf2_32(Align) && "Invalid CPE alignment"); - return Log2_32(Align); + return Align(MCP->getConstants()[CPI].getAlignment()); } /// scanFunctionJumpTables - Do a scan of the function, building up @@ -687,7 +693,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) { BBInfoVector &BBInfo = BBUtils->getBBInfo(); // The known bits of the entry block offset are determined by the function // alignment. - BBInfo.front().KnownBits = MF->getAlignment(); + BBInfo.front().KnownBits = Log2(MF->getAlignment()); // Compute block offsets and known bits. BBUtils->adjustBBOffsetsAfter(&MF->front()); @@ -824,11 +830,6 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) { Scale = 2; // +-(offset_8*2) NegOk = true; break; - - case ARM::tLDRHi: - Bits = 5; - Scale = 2; // +(offset_5*2) - break; } // Remember that this is a user of a CP entry. @@ -885,6 +886,13 @@ void ARMConstantIslands::updateForInsertedWaterBlock(MachineBasicBlock *NewBB) { MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) { MachineBasicBlock *OrigBB = MI->getParent(); + // Collect liveness information at MI. + LivePhysRegs LRs(*MF->getSubtarget().getRegisterInfo()); + LRs.addLiveOuts(*OrigBB); + auto LivenessEnd = ++MachineBasicBlock::iterator(MI).getReverse(); + for (MachineInstr &LiveMI : make_range(OrigBB->rbegin(), LivenessEnd)) + LRs.stepBackward(LiveMI); + // Create a new MBB for the code after the OrigBB. MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); @@ -913,6 +921,12 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) { // OrigBB branches to NewBB. OrigBB->addSuccessor(NewBB); + // Update live-in information in the new block. + MachineRegisterInfo &MRI = MF->getRegInfo(); + for (MCPhysReg L : LRs) + if (!MRI.isReserved(L)) + NewBB->addLiveIn(L); + // Update internal data structures to account for the newly inserted MBB. // This is almost the same as updateForInsertedWaterBlock, except that // the Water goes after OrigBB, not NewBB. @@ -1007,13 +1021,13 @@ bool ARMConstantIslands::isWaterInRange(unsigned UserOffset, MachineBasicBlock* Water, CPUser &U, unsigned &Growth) { BBInfoVector &BBInfo = BBUtils->getBBInfo(); - unsigned CPELogAlign = getCPELogAlign(U.CPEMI); - unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign); - unsigned NextBlockOffset, NextBlockAlignment; + const Align CPEAlign = getCPEAlign(U.CPEMI); + const unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPEAlign); + unsigned NextBlockOffset; + Align NextBlockAlignment; MachineFunction::const_iterator NextBlock = Water->getIterator(); if (++NextBlock == MF->end()) { NextBlockOffset = BBInfo[Water->getNumber()].postOffset(); - NextBlockAlignment = 0; } else { NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset; NextBlockAlignment = NextBlock->getAlignment(); @@ -1028,13 +1042,13 @@ bool ARMConstantIslands::isWaterInRange(unsigned UserOffset, Growth = CPEEnd - NextBlockOffset; // Compute the padding that would go at the end of the CPE to align the next // block. 
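  // For example (an illustrative case): if the CPE ends at offset 0x106 and
  // the next block must be 8-byte aligned, offsetToAlignment(0x106, Align(8))
  // returns 2, since 0x108 is the next 8-byte-aligned offset.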
- Growth += OffsetToAlignment(CPEEnd, 1ULL << NextBlockAlignment); + Growth += offsetToAlignment(CPEEnd, NextBlockAlignment); // If the CPE is to be inserted before the instruction, that will raise // the offset of the instruction. Also account for unknown alignment padding // in blocks between CPE and the user. if (CPEOffset < UserOffset) - UserOffset += Growth + UnknownPadding(MF->getAlignment(), CPELogAlign); + UserOffset += Growth + UnknownPadding(MF->getAlignment(), Log2(CPEAlign)); } else // CPE fits in existing padding. Growth = 0; @@ -1200,8 +1214,8 @@ bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset, // inserting islands between BB0 and BB1 makes other accesses out of range. MachineBasicBlock *UserBB = U.MI->getParent(); BBInfoVector &BBInfo = BBUtils->getBBInfo(); - unsigned MinNoSplitDisp = - BBInfo[UserBB->getNumber()].postOffset(getCPELogAlign(U.CPEMI)); + const Align CPEAlign = getCPEAlign(U.CPEMI); + unsigned MinNoSplitDisp = BBInfo[UserBB->getNumber()].postOffset(CPEAlign); if (CloserWater && MinNoSplitDisp > U.getMaxDisp() / 2) return false; for (water_iterator IP = std::prev(WaterList.end()), B = WaterList.begin();; @@ -1254,7 +1268,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, CPUser &U = CPUsers[CPUserIndex]; MachineInstr *UserMI = U.MI; MachineInstr *CPEMI = U.CPEMI; - unsigned CPELogAlign = getCPELogAlign(CPEMI); + const Align CPEAlign = getCPEAlign(CPEMI); MachineBasicBlock *UserMBB = UserMI->getParent(); BBInfoVector &BBInfo = BBUtils->getBBInfo(); const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()]; @@ -1267,7 +1281,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, // Size of branch to insert. unsigned Delta = isThumb1 ? 2 : 4; // Compute the offset where the CPE will begin. - unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta; + unsigned CPEOffset = UserBBI.postOffset(CPEAlign) + Delta; if (isOffsetInRange(UserOffset, CPEOffset, U)) { LLVM_DEBUG(dbgs() << "Split at end of " << printMBBReference(*UserMBB) @@ -1308,11 +1322,11 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, // Try to split the block so it's fully aligned. Compute the latest split // point where we can add a 4-byte branch instruction, and then align to - // LogAlign which is the largest possible alignment in the function. - unsigned LogAlign = MF->getAlignment(); - assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry"); + // Align which is the largest possible alignment in the function. 
+ const Align Align = MF->getAlignment(); + assert(Align >= CPEAlign && "Over-aligned constant pool entry"); unsigned KnownBits = UserBBI.internalKnownBits(); - unsigned UPad = UnknownPadding(LogAlign, KnownBits); + unsigned UPad = UnknownPadding(Align, KnownBits); unsigned BaseInsertOffset = UserOffset + U.getMaxDisp() - UPad; LLVM_DEBUG(dbgs() << format("Split in middle of big block before %#x", BaseInsertOffset)); @@ -1323,7 +1337,7 @@ BaseInsertOffset -= 4; LLVM_DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset) - << " la=" << LogAlign << " kb=" << KnownBits + << " la=" << Log2(Align) << " kb=" << KnownBits << " up=" << UPad << '\n'); // This could point off the end of the block if we've already got constant @@ -1337,6 +1351,28 @@ BaseInsertOffset = std::max(UserBBI.postOffset() - UPad - 8, UserOffset + TII->getInstSizeInBytes(*UserMI) + 1); + // If the CP user (ie, UserOffset) is within the first four instructions + // after an IT, this recalculated BaseInsertOffset could be in the middle of + // an IT block. If it is, move BaseInsertOffset to just after the + // IT block. The CP entry remains in range for the + // following reasons. + // 1. The initial BaseInsertOffset calculated is (UserOffset + + // U.getMaxDisp() - UPad). + // 2. An IT block is at most 4 instructions plus the "it" itself (18 + // bytes). + // 3. All the relevant instructions support a much larger maximum + // displacement. + MachineBasicBlock::iterator I = UserMI; + ++I; + for (unsigned Offset = UserOffset + TII->getInstSizeInBytes(*UserMI), + PredReg = 0; + I->getOpcode() != ARM::t2IT && + getITInstrPredicate(*I, PredReg) != ARMCC::AL; + Offset += TII->getInstSizeInBytes(*I), I = std::next(I)) { + BaseInsertOffset = + std::max(BaseInsertOffset, Offset + TII->getInstSizeInBytes(*I) + 1); + assert(I != UserMBB->end() && "Fell off end of block"); + } LLVM_DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset)); } unsigned EndInsertOffset = BaseInsertOffset + 4 + UPad + @@ -1354,8 +1390,8 @@ CPUser &U = CPUsers[CPUIndex]; if (!isOffsetInRange(Offset, EndInsertOffset, U)) { // Shift insertion point by one unit of alignment so it is within reach. - BaseInsertOffset -= 1u << LogAlign; - EndInsertOffset -= 1u << LogAlign; + BaseInsertOffset -= Align.value(); + EndInsertOffset -= Align.value(); } // This is overly conservative, as we don't account for CPEMIs being // reused within the block, but it doesn't matter much. Also assume CPEs @@ -1397,9 +1433,10 @@ } // We really must not split an IT block. - LLVM_DEBUG(unsigned PredReg; assert( - !isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL)); - +#ifndef NDEBUG + unsigned PredReg; + assert(!isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL); +#endif NewMBB = splitBlockBeforeInstr(&*MI); } @@ -1464,9 +1501,9 @@ // Always align the new block because CP entries can be smaller than 4 // bytes. Be careful not to decrease the existing alignment, e.g. NewMBB may // be an already aligned constant pool block. - const unsigned Align = isThumb ? 1 : 2; - if (NewMBB->getAlignment() < Align) - NewMBB->setAlignment(Align); + const Align Alignment = isThumb ?
Align(2) : Align(4); + if (NewMBB->getAlignment() < Alignment) + NewMBB->setAlignment(Alignment); // Remove the original WaterList entry; we want subsequent insertions in // this vicinity to go after the one we're about to insert. This @@ -1495,7 +1532,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex, decrementCPEReferenceCount(CPI, CPEMI); // Mark the basic block as aligned as required by the const-pool entry. - NewIsland->setAlignment(getCPELogAlign(U.CPEMI)); + NewIsland->setAlignment(getCPEAlign(U.CPEMI)); // Increase the size of the island block to account for the new entry. BBUtils->adjustBBSize(NewIsland, Size); @@ -1529,10 +1566,11 @@ void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) { BBInfo[CPEBB->getNumber()].Size = 0; // This block no longer needs to be aligned. - CPEBB->setAlignment(0); - } else + CPEBB->setAlignment(Align::None()); + } else { // Entries are sorted by descending alignment, so realign from the front. - CPEBB->setAlignment(getCPELogAlign(&*CPEBB->begin())); + CPEBB->setAlignment(getCPEAlign(&*CPEBB->begin())); + } BBUtils->adjustBBOffsetsAfter(CPEBB); // An island has only one predecessor BB and one successor BB. Check if @@ -1620,7 +1658,7 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) { // L2: ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(1).getImm(); CC = ARMCC::getOppositeCondition(CC); - unsigned CCReg = MI->getOperand(2).getReg(); + Register CCReg = MI->getOperand(2).getReg(); // If the branch is at the end of its MBB and that has a fall-through block, // direct the updated conditional branch to the fall-through block. Otherwise, @@ -1778,16 +1816,10 @@ bool ARMConstantIslands::optimizeThumb2Instructions() { return MadeChange; } + bool ARMConstantIslands::optimizeThumb2Branches() { - bool MadeChange = false; - // The order in which branches appear in ImmBranches is approximately their - // order within the function body. By visiting later branches first, we reduce - // the distance between earlier forward branches and their targets, making it - // more likely that the cbn?z optimization, which can only apply to forward - // branches, will succeed. - for (unsigned i = ImmBranches.size(); i != 0; --i) { - ImmBranch &Br = ImmBranches[i-1]; + auto TryShrinkBranch = [this](ImmBranch &Br) { unsigned Opcode = Br.MI->getOpcode(); unsigned NewOpc = 0; unsigned Scale = 1; @@ -1815,47 +1847,115 @@ bool ARMConstantIslands::optimizeThumb2Branches() { BBUtils->adjustBBSize(MBB, -2); BBUtils->adjustBBOffsetsAfter(MBB); ++NumT2BrShrunk; - MadeChange = true; + return true; } } + return false; + }; - Opcode = Br.MI->getOpcode(); - if (Opcode != ARM::tBcc) - continue; + struct ImmCompare { + MachineInstr* MI = nullptr; + unsigned NewOpc = 0; + }; + + auto FindCmpForCBZ = [this](ImmBranch &Br, ImmCompare &ImmCmp, + MachineBasicBlock *DestBB) { + ImmCmp.MI = nullptr; + ImmCmp.NewOpc = 0; // If the conditional branch doesn't kill CPSR, then CPSR can be liveout // so this transformation is not safe. if (!Br.MI->killsRegister(ARM::CPSR)) - continue; + return false; - NewOpc = 0; unsigned PredReg = 0; + unsigned NewOpc = 0; ARMCC::CondCodes Pred = getInstrPredicate(*Br.MI, PredReg); if (Pred == ARMCC::EQ) NewOpc = ARM::tCBZ; else if (Pred == ARMCC::NE) NewOpc = ARM::tCBNZ; - if (!NewOpc) - continue; - MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); + else + return false; + // Check if the distance is within 126. Subtract starting offset by 2 // because the cmp will be eliminated. 
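The distance check implemented next is simple enough to state on its own: a Thumb CBZ/CBNZ encodes a 6-bit immediate scaled by 2, so it can only branch forward, at most 126 bytes. A standalone sketch of the same arithmetic (names are illustrative):

```cpp
// The branch site is the instruction address plus 4 (how Thumb reads PC),
// minus the 2 bytes freed by deleting the CMP folded into the CBZ/CBNZ.
bool cbzCanReach(unsigned BrInsnOffset, unsigned DestOffset) {
  unsigned Site = BrInsnOffset + 4 - 2;
  return Site < DestOffset && DestOffset - Site <= 126;
}
```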
unsigned BrOffset = BBUtils->getOffsetOf(Br.MI) + 4 - 2; BBInfoVector &BBInfo = BBUtils->getBBInfo(); unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset; if (BrOffset >= DestOffset || (DestOffset - BrOffset) > 126) - continue; + return false; // Search backwards to find a tCMPi8 auto *TRI = STI->getRegisterInfo(); MachineInstr *CmpMI = findCMPToFoldIntoCBZ(Br.MI, TRI); if (!CmpMI || CmpMI->getOpcode() != ARM::tCMPi8) + return false; + + ImmCmp.MI = CmpMI; + ImmCmp.NewOpc = NewOpc; + return true; + }; + + auto TryConvertToLE = [this](ImmBranch &Br, ImmCompare &Cmp) { + if (Br.MI->getOpcode() != ARM::t2Bcc || !STI->hasLOB() || + STI->hasMinSize()) + return false; + + MachineBasicBlock *MBB = Br.MI->getParent(); + MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); + if (BBUtils->getOffsetOf(MBB) < BBUtils->getOffsetOf(DestBB) || + !BBUtils->isBBInRange(Br.MI, DestBB, 4094)) + return false; + + if (!DT->dominates(DestBB, MBB)) + return false; + + // We queried for the CBN?Z opcode based upon the 'ExitBB', the opposite + // target of Br. So now we need to reverse the condition. + Cmp.NewOpc = Cmp.NewOpc == ARM::tCBZ ? ARM::tCBNZ : ARM::tCBZ; + + MachineInstrBuilder MIB = BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), + TII->get(ARM::t2LE)); + MIB.add(Br.MI->getOperand(0)); + Br.MI->eraseFromParent(); + Br.MI = MIB; + ++NumLEInserted; + return true; + }; + + bool MadeChange = false; + + // The order in which branches appear in ImmBranches is approximately their + // order within the function body. By visiting later branches first, we reduce + // the distance between earlier forward branches and their targets, making it + // more likely that the cbn?z optimization, which can only apply to forward + // branches, will succeed. + for (ImmBranch &Br : reverse(ImmBranches)) { + MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); + MachineBasicBlock *MBB = Br.MI->getParent(); + MachineBasicBlock *ExitBB = &MBB->back() == Br.MI ? + MBB->getFallThrough() : + MBB->back().getOperand(0).getMBB(); + + ImmCompare Cmp; + if (FindCmpForCBZ(Br, Cmp, ExitBB) && TryConvertToLE(Br, Cmp)) { + DestBB = ExitBB; + MadeChange = true; + } else { + FindCmpForCBZ(Br, Cmp, DestBB); + MadeChange |= TryShrinkBranch(Br); + } + + unsigned Opcode = Br.MI->getOpcode(); + if ((Opcode != ARM::tBcc && Opcode != ARM::t2LE) || !Cmp.NewOpc) continue; - unsigned Reg = CmpMI->getOperand(0).getReg(); + Register Reg = Cmp.MI->getOperand(0).getReg(); // Check for Kill flags on Reg. If they are present remove them and set kill // on the new CBZ. 
+ auto *TRI = STI->getRegisterInfo(); MachineBasicBlock::iterator KillMI = Br.MI; bool RegKilled = false; do { @@ -1865,19 +1965,32 @@ bool ARMConstantIslands::optimizeThumb2Branches() { RegKilled = true; break; } - } while (KillMI != CmpMI); + } while (KillMI != Cmp.MI); // Create the new CBZ/CBNZ - MachineBasicBlock *MBB = Br.MI->getParent(); - LLVM_DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI); + LLVM_DEBUG(dbgs() << "Fold: " << *Cmp.MI << " and: " << *Br.MI); MachineInstr *NewBR = - BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), TII->get(NewOpc)) + BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), TII->get(Cmp.NewOpc)) .addReg(Reg, getKillRegState(RegKilled)) .addMBB(DestBB, Br.MI->getOperand(0).getTargetFlags()); - CmpMI->eraseFromParent(); - Br.MI->eraseFromParent(); - Br.MI = NewBR; + + Cmp.MI->eraseFromParent(); + BBInfoVector &BBInfo = BBUtils->getBBInfo(); BBInfo[MBB->getNumber()].Size -= 2; + + if (Br.MI->getOpcode() == ARM::tBcc) { + Br.MI->eraseFromParent(); + Br.MI = NewBR; + } else if (&MBB->back() != Br.MI) { + // We've generated an LE and already erased the original conditional + // branch. The CBN?Z is now used to branch to the other successor, so an + // unconditional branch terminator is now redundant. + MachineInstr *LastMI = &MBB->back(); + if (LastMI != Br.MI) { + BBInfo[MBB->getNumber()].Size -= LastMI->getDesc().getSize(); + LastMI->eraseFromParent(); + } + } BBUtils->adjustBBOffsetsAfter(MBB); ++NumCBZ; MadeChange = true; @@ -1931,8 +2044,8 @@ bool ARMConstantIslands::preserveBaseRegister(MachineInstr *JumpMI, // of BaseReg, but only if the t2ADDrs can be removed. // + Some instruction other than t2ADDrs computing the entry. Not seen in // the wild, but we should be careful. - unsigned EntryReg = JumpMI->getOperand(0).getReg(); - unsigned BaseReg = LEAMI->getOperand(0).getReg(); + Register EntryReg = JumpMI->getOperand(0).getReg(); + Register BaseReg = LEAMI->getOperand(0).getReg(); CanDeleteLEA = true; BaseRegKill = false; @@ -2009,7 +2122,7 @@ static void RemoveDeadAddBetweenLEAAndJT(MachineInstr *LEAMI, // but the JT now uses PC. Finds the last ADD (if any) that def's EntryReg // and is not clobbered / used. MachineInstr *RemovableAdd = nullptr; - unsigned EntryReg = JumpMI->getOperand(0).getReg(); + Register EntryReg = JumpMI->getOperand(0).getReg(); // Find the last ADD to set EntryReg MachineBasicBlock::iterator I(LEAMI); @@ -2106,7 +2219,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() { // %idx = tLSLri %idx, 2 // %base = tLEApcrelJT // %t = tLDRr %base, %idx - unsigned BaseReg = User.MI->getOperand(0).getReg(); + Register BaseReg = User.MI->getOperand(0).getReg(); if (User.MI->getIterator() == User.MI->getParent()->begin()) continue; @@ -2116,7 +2229,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() { !Shift->getOperand(2).isKill()) continue; IdxReg = Shift->getOperand(2).getReg(); - unsigned ShiftedIdxReg = Shift->getOperand(0).getReg(); + Register ShiftedIdxReg = Shift->getOperand(0).getReg(); // It's important that IdxReg is live until the actual TBB/TBH. Most of // the range is checked later, but the LEA might still clobber it and not @@ -2313,6 +2426,10 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { MachineFunction::iterator MBBI = ++JTBB->getIterator(); MF->insert(MBBI, NewBB); + // Copy live-in information to new block. + for (const MachineBasicBlock::RegisterMaskPair &RegMaskPair : BB->liveins()) + NewBB->addLiveIn(RegMaskPair); + // Add an unconditional branch from NewBB to BB. 
// There doesn't seem to be meaningful DebugInfo available; this doesn't // correspond directly to anything in the source. diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp index 3bdb0e1ef62d..72c95f441265 100644 --- a/lib/Target/ARM/ARMConstantPoolValue.cpp +++ b/lib/Target/ARM/ARMConstantPoolValue.cpp @@ -17,6 +17,7 @@ #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Type.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index b32ba3eeea18..563fdda56104 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -481,7 +481,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { unsigned OpIdx = 0; bool DstIsDead = MI.getOperand(OpIdx).isDead(); - unsigned DstReg = MI.getOperand(OpIdx++).getReg(); + Register DstReg = MI.getOperand(OpIdx++).getReg(); if(TableEntry->RealOpc == ARM::VLD2DUPd8x2 || TableEntry->RealOpc == ARM::VLD2DUPd16x2 || TableEntry->RealOpc == ARM::VLD2DUPd32x2) { @@ -492,7 +492,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { assert(RegSpc == OddDblSpc && "Unexpected spacing!"); SubRegIndex = ARM::dsub_1; } - unsigned SubReg = TRI->getSubReg(DstReg, SubRegIndex); + Register SubReg = TRI->getSubReg(DstReg, SubRegIndex); unsigned DstRegPair = TRI->getMatchingSuperReg(SubReg, ARM::dsub_0, &ARM::DPairSpcRegClass); MIB.addReg(DstRegPair, RegState::Define | getDeadRegState(DstIsDead)); @@ -624,7 +624,7 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) { bool SrcIsKill = MI.getOperand(OpIdx).isKill(); bool SrcIsUndef = MI.getOperand(OpIdx).isUndef(); - unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); + Register SrcReg = MI.getOperand(OpIdx++).getReg(); unsigned D0, D1, D2, D3; GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3); MIB.addReg(D0, getUndefRegState(SrcIsUndef)); @@ -760,7 +760,7 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI, } bool SrcIsKill = MI.getOperand(OpIdx).isKill(); - unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); + Register SrcReg = MI.getOperand(OpIdx++).getReg(); unsigned D0, D1, D2, D3; GetDSubRegs(SrcReg, SingleSpc, TRI, D0, D1, D2, D3); MIB.addReg(D0); @@ -789,6 +789,7 @@ static bool IsAnAddressOperand(const MachineOperand &MO) { case MachineOperand::MO_Immediate: case MachineOperand::MO_CImmediate: case MachineOperand::MO_FPImmediate: + case MachineOperand::MO_ShuffleMask: return false; case MachineOperand::MO_MachineBasicBlock: return true; @@ -828,7 +829,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, unsigned Opcode = MI.getOpcode(); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm; const MachineOperand &MO = MI.getOperand(isCC ? 
2 : 1); @@ -932,13 +933,13 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); const MachineOperand &Dest = MI.getOperand(0); - unsigned TempReg = MI.getOperand(1).getReg(); + Register TempReg = MI.getOperand(1).getReg(); // Duplicating undef operands into 2 instructions does not guarantee the same // value on both; However undef should be replaced by xzr anyway. assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); - unsigned AddrReg = MI.getOperand(2).getReg(); - unsigned DesiredReg = MI.getOperand(3).getReg(); - unsigned NewReg = MI.getOperand(4).getReg(); + Register AddrReg = MI.getOperand(2).getReg(); + Register DesiredReg = MI.getOperand(3).getReg(); + Register NewReg = MI.getOperand(4).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -1035,8 +1036,8 @@ static void addExclusiveRegPair(MachineInstrBuilder &MIB, MachineOperand &Reg, unsigned Flags, bool IsThumb, const TargetRegisterInfo *TRI) { if (IsThumb) { - unsigned RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0); - unsigned RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1); + Register RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0); + Register RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1); MIB.addReg(RegLo, Flags); MIB.addReg(RegHi, Flags); } else @@ -1051,19 +1052,19 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); MachineOperand &Dest = MI.getOperand(0); - unsigned TempReg = MI.getOperand(1).getReg(); + Register TempReg = MI.getOperand(1).getReg(); // Duplicating undef operands into 2 instructions does not guarantee the same // value on both; However undef should be replaced by xzr anyway. assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); - unsigned AddrReg = MI.getOperand(2).getReg(); - unsigned DesiredReg = MI.getOperand(3).getReg(); + Register AddrReg = MI.getOperand(2).getReg(); + Register DesiredReg = MI.getOperand(3).getReg(); MachineOperand New = MI.getOperand(4); New.setIsKill(false); - unsigned DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0); - unsigned DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1); - unsigned DesiredLo = TRI->getSubReg(DesiredReg, ARM::gsub_0); - unsigned DesiredHi = TRI->getSubReg(DesiredReg, ARM::gsub_1); + Register DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0); + Register DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1); + Register DesiredLo = TRI->getSubReg(DesiredReg, ARM::gsub_0); + Register DesiredHi = TRI->getSubReg(DesiredReg, ARM::gsub_1); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -1204,8 +1205,11 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) NewMI->addOperand(MBBI->getOperand(i)); - // Delete the pseudo instruction TCRETURN. + + // Update call site info and delete the pseudo instruction TCRETURN. + MBB.getParent()->moveCallSiteInfo(&MI, &*NewMI); MBB.erase(MBBI); + MBBI = NewMI; return true; } @@ -1336,7 +1340,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // for us. Otherwise, expand to nothing. 
if (RI.hasBasePointer(MF)) { int32_t NumBytes = AFI->getFramePtrSpillOffset(); - unsigned FramePtr = RI.getFrameRegister(MF); + Register FramePtr = RI.getFrameRegister(MF); assert(MF.getSubtarget().getFrameLowering()->hasFP(MF) && "base pointer without frame pointer?"); @@ -1412,7 +1416,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MachineConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(MF->getFunction().getContext(), "__aeabi_read_tp", PCLabelID, 0); - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Thumb ? ARM::tLDRpci : ARM::LDRi12), Reg) .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4)); @@ -1435,6 +1439,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MIB.cloneMemRefs(MI); TransferImpOps(MI, MIB, MIB); + MI.getMF()->moveCallSiteInfo(&MI, &*MIB); MI.eraseFromParent(); return true; } @@ -1442,7 +1447,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::t2LDRpci_pic: { unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic) ? ARM::tLDRpci : ARM::t2LDRpci; - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewLdOpc), DstReg) @@ -1464,7 +1469,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::LDRLIT_ga_pcrel_ldr: case ARM::tLDRLIT_ga_abs: case ARM::tLDRLIT_ga_pcrel: { - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); const MachineOperand &MO1 = MI.getOperand(1); auto Flags = MO1.getTargetFlags(); @@ -1522,7 +1527,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::t2MOV_ga_pcrel: { // Expand into movw + movw. Also "add pc" / ldr [pc] in PIC mode. unsigned LabelId = AFI->createPICLabelUId(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); const MachineOperand &MO1 = MI.getOperand(1); const GlobalValue *GV = MO1.getGlobal(); @@ -1586,7 +1591,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // Grab the Q register destination. bool DstIsDead = MI.getOperand(OpIdx).isDead(); - unsigned DstReg = MI.getOperand(OpIdx++).getReg(); + Register DstReg = MI.getOperand(OpIdx++).getReg(); // Copy the source register. MIB.add(MI.getOperand(OpIdx++)); @@ -1596,8 +1601,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MIB.add(MI.getOperand(OpIdx++)); // Add the destination operands (D subregs). - unsigned D0 = TRI->getSubReg(DstReg, ARM::dsub_0); - unsigned D1 = TRI->getSubReg(DstReg, ARM::dsub_1); + Register D0 = TRI->getSubReg(DstReg, ARM::dsub_0); + Register D1 = TRI->getSubReg(DstReg, ARM::dsub_1); MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)) .addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); @@ -1617,7 +1622,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // Grab the Q register source. bool SrcIsKill = MI.getOperand(OpIdx).isKill(); - unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); + Register SrcReg = MI.getOperand(OpIdx++).getReg(); // Copy the destination register. MachineOperand Dst(MI.getOperand(OpIdx++)); @@ -1628,8 +1633,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MIB.add(MI.getOperand(OpIdx++)); // Add the source operands (D subregs). 
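For readers unfamiliar with the register pairing exploited by these getSubReg calls: in the Arm 32-bit SIMD register file, each 128-bit Q register aliases two consecutive 64-bit D registers, so a Q operand expands into a dsub_0/dsub_1 pair. As a toy model:

```cpp
// Qn overlaps D(2n) (low half, dsub_0) and D(2n+1) (high half, dsub_1).
unsigned dsub0OfQ(unsigned N) { return 2 * N; }
unsigned dsub1OfQ(unsigned N) { return 2 * N + 1; }
```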
- unsigned D0 = TRI->getSubReg(SrcReg, ARM::dsub_0); - unsigned D1 = TRI->getSubReg(SrcReg, ARM::dsub_1); + Register D0 = TRI->getSubReg(SrcReg, ARM::dsub_0); + Register D1 = TRI->getSubReg(SrcReg, ARM::dsub_1); MIB.addReg(D0, SrcIsKill ? RegState::Kill : 0) .addReg(D1, SrcIsKill ? RegState::Kill : 0); @@ -1915,6 +1920,37 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::CMP_SWAP_64: return ExpandCMP_SWAP_64(MBB, MBBI, NextMBBI); + + case ARM::tBL_PUSHLR: + case ARM::BL_PUSHLR: { + const bool Thumb = Opcode == ARM::tBL_PUSHLR; + Register Reg = MI.getOperand(0).getReg(); + assert(Reg == ARM::LR && "expect LR register!"); + MachineInstrBuilder MIB; + if (Thumb) { + // push {lr} + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tPUSH)) + .add(predOps(ARMCC::AL)) + .addReg(Reg); + + // bl __gnu_mcount_nc + MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tBL)); + } else { + // stmdb sp!, {lr} + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::STMDB_UPD)) + .addReg(ARM::SP, RegState::Define) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)) + .addReg(Reg); + + // bl __gnu_mcount_nc + MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::BL)); + } + MIB.cloneMemRefs(MI); + for (unsigned i = 1; i < MI.getNumOperands(); ++i) MIB.add(MI.getOperand(i)); + MI.eraseFromParent(); + return true; + } } } diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 6e274d269bf2..1fc5ff6921c6 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -191,8 +191,8 @@ class ARMFastISel final : public FastISel { bool isTypeLegal(Type *Ty, MVT &VT); bool isLoadTypeLegal(Type *Ty, MVT &VT); bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, - bool isZExt, bool isEquality); - bool ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, + bool isZExt); + bool ARMEmitLoad(MVT VT, Register &ResultReg, Address &Addr, unsigned Alignment = 0, bool isZExt = true, bool allocReg = true); bool ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr, @@ -219,15 +219,15 @@ class ARMFastISel final : public FastISel { bool Return, bool isVarArg); bool ProcessCallArgs(SmallVectorImpl<Value*> &Args, - SmallVectorImpl<unsigned> &ArgRegs, + SmallVectorImpl<Register> &ArgRegs, SmallVectorImpl<MVT> &ArgVTs, SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags, - SmallVectorImpl<unsigned> &RegArgs, + SmallVectorImpl<Register> &RegArgs, CallingConv::ID CC, unsigned &NumBytes, bool isVarArg); unsigned getLibcallReg(const Twine &Name); - bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, + bool FinishCall(MVT RetVT, SmallVectorImpl<Register> &UsedRegs, const Instruction *I, CallingConv::ID CC, unsigned &NumBytes, bool isVarArg); bool ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call); @@ -301,7 +301,7 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) { unsigned ARMFastISel::fastEmitInst_r(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill) { - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); // Make sure the input operand is sufficiently constrained to be legal @@ -913,7 +913,7 @@ void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr, AddOptionalDefs(MIB); } -bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, +bool ARMFastISel::ARMEmitLoad(MVT VT, Register &ResultReg, Address &Addr, unsigned Alignment, bool isZExt, bool allocReg) { unsigned Opc; bool useAM3 = false; @@ 
-1045,7 +1045,7 @@ bool ARMFastISel::SelectLoad(const Instruction *I) { Address Addr; if (!ARMComputeAddress(I->getOperand(0), Addr)) return false; - unsigned ResultReg; + Register ResultReg; if (!ARMEmitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlignment())) return false; updateValueMap(I, ResultReg); @@ -1259,8 +1259,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { if (ARMPred == ARMCC::AL) return false; // Emit the compare. - if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(), - CI->isEquality())) + if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) return false; unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc; @@ -1349,7 +1348,7 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) { } bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, - bool isZExt, bool isEquality) { + bool isZExt) { Type *Ty = Src1Value->getType(); EVT SrcEVT = TLI.getValueType(DL, Ty, true); if (!SrcEVT.isSimple()) return false; @@ -1397,19 +1396,11 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, // TODO: Verify compares. case MVT::f32: isICmp = false; - // Equality comparisons shouldn't raise Invalid on uordered inputs. - if (isEquality) - CmpOpc = UseImm ? ARM::VCMPZS : ARM::VCMPS; - else - CmpOpc = UseImm ? ARM::VCMPEZS : ARM::VCMPES; + CmpOpc = UseImm ? ARM::VCMPZS : ARM::VCMPS; break; case MVT::f64: isICmp = false; - // Equality comparisons shouldn't raise Invalid on uordered inputs. - if (isEquality) - CmpOpc = UseImm ? ARM::VCMPZD : ARM::VCMPD; - else - CmpOpc = UseImm ? ARM::VCMPEZD : ARM::VCMPED; + CmpOpc = UseImm ? ARM::VCMPZD : ARM::VCMPD; break; case MVT::i1: case MVT::i8: @@ -1485,8 +1476,7 @@ bool ARMFastISel::SelectCmp(const Instruction *I) { if (ARMPred == ARMCC::AL) return false; // Emit the compare. - if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(), - CI->isEquality())) + if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) return false; // Now set a register based on the comparison. Explicitly set the predicates @@ -1893,10 +1883,10 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, } bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, - SmallVectorImpl<unsigned> &ArgRegs, + SmallVectorImpl<Register> &ArgRegs, SmallVectorImpl<MVT> &ArgVTs, SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags, - SmallVectorImpl<unsigned> &RegArgs, + SmallVectorImpl<Register> &RegArgs, CallingConv::ID CC, unsigned &NumBytes, bool isVarArg) { @@ -1960,7 +1950,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; const Value *ArgVal = Args[VA.getValNo()]; - unsigned Arg = ArgRegs[VA.getValNo()]; + Register Arg = ArgRegs[VA.getValNo()]; MVT ArgVT = ArgVTs[VA.getValNo()]; assert((!ArgVT.isVector() && ArgVT.getSizeInBits() <= 64) && @@ -2039,7 +2029,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, return true; } -bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, +bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<Register> &UsedRegs, const Instruction *I, CallingConv::ID CC, unsigned &NumBytes, bool isVarArg) { // Issue CALLSEQ_END @@ -2060,7 +2050,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, // double fp reg we want. 
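The register copies built just below reassemble an f64 return value that arrives split across two GPRs (r0/r1 under the soft-float return convention); VMOVDRR is the instruction doing the combine. A host-side model of the same bit-level operation, with illustrative names and assuming little-endian word order:

```cpp
#include <cstdint>
#include <cstring>

// Combine the low and high 32-bit halves of a softened f64 return value.
double combineF64Halves(uint32_t Lo, uint32_t Hi) {
  uint64_t Bits = (uint64_t(Hi) << 32) | Lo;
  double D;
  std::memcpy(&D, &Bits, sizeof D); // bit-cast without aliasing UB
  return D;
}
```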
MVT DestVT = RVLocs[0].getValVT(); const TargetRegisterClass* DstRC = TLI.getRegClassFor(DestVT); - unsigned ResultReg = createResultReg(DstRC); + Register ResultReg = createResultReg(DstRC); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM::VMOVDRR), ResultReg) .addReg(RVLocs[0].getLocReg()) @@ -2081,7 +2071,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, const TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT); - unsigned ResultReg = createResultReg(DstRC); + Register ResultReg = createResultReg(DstRC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(RVLocs[0].getLocReg()); @@ -2162,7 +2152,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) { } // Make the copy. - unsigned DstReg = VA.getLocReg(); + Register DstReg = VA.getLocReg(); const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg); // Avoid a cross-class copy. This is very unlikely. if (!SrcRC->contains(DstReg)) @@ -2231,7 +2221,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { // Set up the argument vectors. SmallVector<Value*, 8> Args; - SmallVector<unsigned, 8> ArgRegs; + SmallVector<Register, 8> ArgRegs; SmallVector<MVT, 8> ArgVTs; SmallVector<ISD::ArgFlagsTy, 8> ArgFlags; Args.reserve(I->getNumOperands()); @@ -2247,8 +2237,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { if (!isTypeLegal(ArgTy, ArgVT)) return false; ISD::ArgFlagsTy Flags; - unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); - Flags.setOrigAlign(OriginalAlignment); + Flags.setOrigAlign(Align(DL.getABITypeAlignment(ArgTy))); Args.push_back(Op); ArgRegs.push_back(Arg); @@ -2257,13 +2246,13 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { } // Handle the arguments now that we've gotten them. - SmallVector<unsigned, 4> RegArgs; + SmallVector<Register, 4> RegArgs; unsigned NumBytes; if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes, false)) return false; - unsigned CalleeReg = 0; + Register CalleeReg; if (Subtarget->genLongCalls()) { CalleeReg = getLibcallReg(TLI.getLibcallName(Call)); if (CalleeReg == 0) return false; @@ -2282,7 +2271,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { MIB.addExternalSymbol(TLI.getLibcallName(Call)); // Add implicit physical register uses to the call. - for (unsigned R : RegArgs) + for (Register R : RegArgs) MIB.addReg(R, RegState::Implicit); // Add a register mask with the call-preserved registers. @@ -2290,7 +2279,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); // Finish off the call including any return values. - SmallVector<unsigned, 4> UsedRegs; + SmallVector<Register, 4> UsedRegs; if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, false)) return false; // Set all unused physreg defs as dead. @@ -2340,7 +2329,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, // Set up the argument vectors. 
SmallVector<Value*, 8> Args; - SmallVector<unsigned, 8> ArgRegs; + SmallVector<Register, 8> ArgRegs; SmallVector<MVT, 8> ArgVTs; SmallVector<ISD::ArgFlagsTy, 8> ArgFlags; unsigned arg_size = CS.arg_size(); @@ -2377,12 +2366,11 @@ bool ARMFastISel::SelectCall(const Instruction *I, ArgVT != MVT::i1) return false; - unsigned Arg = getRegForValue(*i); - if (Arg == 0) + Register Arg = getRegForValue(*i); + if (!Arg.isValid()) return false; - unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); - Flags.setOrigAlign(OriginalAlignment); + Flags.setOrigAlign(Align(DL.getABITypeAlignment(ArgTy))); Args.push_back(*i); ArgRegs.push_back(Arg); @@ -2391,7 +2379,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, } // Handle the arguments now that we've gotten them. - SmallVector<unsigned, 4> RegArgs; + SmallVector<Register, 4> RegArgs; unsigned NumBytes; if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes, isVarArg)) @@ -2401,7 +2389,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, const GlobalValue *GV = dyn_cast<GlobalValue>(Callee); if (!GV || Subtarget->genLongCalls()) UseReg = true; - unsigned CalleeReg = 0; + Register CalleeReg; if (UseReg) { if (IntrMemName) CalleeReg = getLibcallReg(IntrMemName); @@ -2427,7 +2415,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, MIB.addExternalSymbol(IntrMemName, 0); // Add implicit physical register uses to the call. - for (unsigned R : RegArgs) + for (Register R : RegArgs) MIB.addReg(R, RegState::Implicit); // Add a register mask with the call-preserved registers. @@ -2435,7 +2423,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); // Finish off the call including any return values. - SmallVector<unsigned, 4> UsedRegs; + SmallVector<Register, 4> UsedRegs; if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, isVarArg)) return false; @@ -2476,7 +2464,7 @@ bool ARMFastISel::ARMTryEmitSmallMemCpy(Address Dest, Address Src, } bool RV; - unsigned ResultReg; + Register ResultReg; RV = ARMEmitLoad(VT, ResultReg, Src); assert(RV && "Should be able to handle this load."); RV = ARMEmitStore(VT, ResultReg, Dest); @@ -2506,7 +2494,7 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) { const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(Subtarget->getRegisterInfo()); - unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); + Register FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); unsigned SrcReg = FramePtr; // Recursively load frame address @@ -2947,7 +2935,7 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, Address Addr; if (!ARMComputeAddress(LI->getOperand(0), Addr)) return false; - unsigned ResultReg = MI->getOperand(0).getReg(); + Register ResultReg = MI->getOperand(0).getReg(); if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false)) return false; MachineBasicBlock::iterator I(MI); @@ -2974,7 +2962,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), MachineMemOperand::MOLoad, 4, 4); - unsigned TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass); + Register TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass); unsigned Opc = isThumb2 ? 
ARM::t2LDRpci : ARM::LDRcp; MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), TempReg) diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index bedb779bcba0..01ae93086dcb 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -76,7 +76,7 @@ skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI, unsigned NumAlignedDPRCS2Regs); ARMFrameLowering::ARMFrameLowering(const ARMSubtarget &sti) - : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4), + : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, Align(4)), STI(sti) {} bool ARMFrameLowering::keepFramePointer(const MachineFunction &MF) const { @@ -376,7 +376,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // to determine the end of the prologue. DebugLoc dl; - unsigned FramePtr = RegInfo->getFrameRegister(MF); + Register FramePtr = RegInfo->getFrameRegister(MF); // Determine the sizes of each callee-save spill areas and record which frame // belongs to which callee-save spill areas. @@ -780,7 +780,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); int NumBytes = (int)MFI.getStackSize(); - unsigned FramePtr = RegInfo->getFrameRegister(MF); + Register FramePtr = RegInfo->getFrameRegister(MF); // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. @@ -1503,11 +1503,17 @@ static unsigned EstimateFunctionSizeInBytes(const MachineFunction &MF, /// instructions will require a scratch register during their expansion later. // FIXME: Move to TII? static unsigned estimateRSStackSizeLimit(MachineFunction &MF, - const TargetFrameLowering *TFI) { + const TargetFrameLowering *TFI, + bool &HasNonSPFrameIndex) { const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const ARMBaseInstrInfo &TII = + *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); unsigned Limit = (1 << 12) - 1; for (auto &MBB : MF) { for (auto &MI : MBB) { + if (MI.isDebugInstr()) + continue; for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { if (!MI.getOperand(i).isFI()) continue; @@ -1518,13 +1524,29 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF, Limit = std::min(Limit, (1U << 8) - 1); break; } + // t2ADDri will not require an extra register, it can reuse the + // destination. + if (MI.getOpcode() == ARM::t2ADDri || MI.getOpcode() == ARM::t2ADDri12) + break; + + const MCInstrDesc &MCID = MI.getDesc(); + const TargetRegisterClass *RegClass = TII.getRegClass(MCID, i, TRI, MF); + if (RegClass && !RegClass->contains(ARM::SP)) + HasNonSPFrameIndex = true; // Otherwise check the addressing mode. switch (MI.getDesc().TSFlags & ARMII::AddrModeMask) { + case ARMII::AddrMode_i12: + case ARMII::AddrMode2: + // Default 12 bit limit. + break; case ARMII::AddrMode3: case ARMII::AddrModeT2_i8: Limit = std::min(Limit, (1U << 8) - 1); break; + case ARMII::AddrMode5FP16: + Limit = std::min(Limit, ((1U << 8) - 1) * 2); + break; case ARMII::AddrMode5: case ARMII::AddrModeT2_i8s4: case ARMII::AddrModeT2_ldrex: @@ -1541,8 +1563,17 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF, // Addressing modes 4 & 6 (load/store) instructions can't encode an // immediate offset for stack references. 
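All of the limits in the addressing-mode switch here follow one pattern: an N-bit unsigned immediate scaled by S bytes reaches at most ((1 << N) - 1) * S from the base register. A minimal model (the function is illustrative, not an LLVM API):

```cpp
// Maximum reachable byte offset for an N-bit immediate scaled by S bytes.
// e.g. AddrModeT2_i8  -> maxImmOffset(8, 1) == 255,
//      AddrModeT2_i7s4 -> maxImmOffset(7, 4) == 508.
constexpr unsigned maxImmOffset(unsigned Bits, unsigned Scale) {
  return ((1u << Bits) - 1) * Scale;
}
```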
return 0; - default: + case ARMII::AddrModeT2_i7: + Limit = std::min(Limit, ((1U << 7) - 1) * 1); + break; + case ARMII::AddrModeT2_i7s2: + Limit = std::min(Limit, ((1U << 7) - 1) * 2); break; + case ARMII::AddrModeT2_i7s4: + Limit = std::min(Limit, ((1U << 7) - 1) * 4); + break; + default: + llvm_unreachable("Unhandled addressing mode in stack size limit calculation"); } break; // At most one FI per instruction } @@ -1623,7 +1654,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); (void)TRI; // Silence unused warning in non-assert builds. - unsigned FramePtr = RegInfo->getFrameRegister(MF); + Register FramePtr = RegInfo->getFrameRegister(MF); // Spill R4 if Thumb2 function requires stack realignment - it will be used as // scratch register. Also spill R4 if Thumb2 function has varsized objects, @@ -1784,6 +1815,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, EstimatedStackSize += 16; // For possible paddings. unsigned EstimatedRSStackSizeLimit, EstimatedRSFixedSizeLimit; + bool HasNonSPFrameIndex = false; if (AFI->isThumb1OnlyFunction()) { // For Thumb1, don't bother to iterate over the function. The only // instruction that requires an emergency spill slot is a store to a @@ -1804,7 +1836,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, EstimatedRSStackSizeLimit = (1U << 8) * 4; EstimatedRSFixedSizeLimit = (1U << 5) * 4; } else { - EstimatedRSStackSizeLimit = estimateRSStackSizeLimit(MF, this); + EstimatedRSStackSizeLimit = + estimateRSStackSizeLimit(MF, this, HasNonSPFrameIndex); EstimatedRSFixedSizeLimit = EstimatedRSStackSizeLimit; } // Final estimate of whether sp or bp-relative accesses might require @@ -1830,12 +1863,11 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, HasFP && (MaxFixedOffset - MaxFPOffset) > (int)EstimatedRSFixedSizeLimit; bool BigFrameOffsets = HasLargeStack || !HasBPOrFixedSP || - HasLargeArgumentList; + HasLargeArgumentList || HasNonSPFrameIndex; LLVM_DEBUG(dbgs() << "EstimatedLimit: " << EstimatedRSStackSizeLimit - << "; EstimatedStack" << EstimatedStackSize - << "; EstimatedFPStack" << MaxFixedOffset - MaxFPOffset - << "; BigFrameOffsets: " << BigFrameOffsets - << "\n"); + << "; EstimatedStack: " << EstimatedStackSize + << "; EstimatedFPStack: " << MaxFixedOffset - MaxFPOffset + << "; BigFrameOffsets: " << BigFrameOffsets << "\n"); if (BigFrameOffsets || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) { AFI->setHasStackFrame(true); @@ -2080,9 +2112,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, ExtraCSSpill = true; } } - if (!ExtraCSSpill) { + if (!ExtraCSSpill && RS) { // Reserve a slot closest to SP or frame pointer. - assert(RS && "Register scavenging not provided"); LLVM_DEBUG(dbgs() << "Reserving emergency spill slot\n"); const TargetRegisterClass &RC = ARM::GPRRegClass; unsigned Size = TRI->getSpillSize(RC); @@ -2097,6 +2128,12 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, AFI->setLRIsSpilledForFarJump(true); } AFI->setLRIsSpilled(SavedRegs.test(ARM::LR)); + + // If we have the "returned" parameter attribute which guarantees that we + // return the value which was passed in r0 unmodified (e.g. C++ 'structors), + // record that fact for IPRA. 
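For context on setting R0 in SavedRegs below: the IR-level `returned` attribute promises that the callee hands its first argument back unchanged, so interprocedural register allocation may treat r0 as live-through such calls. Roughly, the guarantee looks like this (an illustrative sketch, not LLVM code):

```cpp
// With 'returned' on ThisPtr, callers may rely on r0 still holding ThisPtr
// after the call; the Arm C++ ABI arranges this for constructors/destructors.
void *construct(void *ThisPtr) { // ThisPtr carries 'returned' in the IR
  // ... initialize *ThisPtr ...
  return ThisPtr;
}
```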
+ if (AFI->getPreservesR0()) + SavedRegs.set(ARM::R0); } MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr( diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h index 7544ca3c38d6..6d8aee597945 100644 --- a/lib/Target/ARM/ARMFrameLowering.h +++ b/lib/Target/ARM/ARMFrameLowering.h @@ -63,6 +63,11 @@ public: bool enableShrinkWrapping(const MachineFunction &MF) const override { return true; } + bool isProfitableForNoCSROpt(const Function &F) const override { + // The no-CSR optimisation is bad for code size on ARM, because we can save + // many registers with a single PUSH/POP pair. + return false; + } private: void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index b349627b67b1..8f6515c423eb 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -139,6 +139,8 @@ public: bool SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm); + template <unsigned Shift> + bool SelectTAddrModeImm7(SDValue N, SDValue &Base, SDValue &OffImm); // Thumb 2 Addressing Modes: bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); @@ -146,9 +148,12 @@ public: SDValue &OffImm); bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, SDValue &OffImm); - template<unsigned Shift> - bool SelectT2AddrModeImm7(SDValue N, SDValue &Base, - SDValue &OffImm); + template <unsigned Shift> + bool SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, SDValue &OffImm); + bool SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, SDValue &OffImm, + unsigned Shift); + template <unsigned Shift> + bool SelectT2AddrModeImm7(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base, SDValue &OffReg, SDValue &ShImm); bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm); @@ -179,6 +184,7 @@ private: bool tryARMIndexedLoad(SDNode *N); bool tryT1IndexedLoad(SDNode *N); bool tryT2IndexedLoad(SDNode *N); + bool tryMVEIndexedLoad(SDNode *N); /// SelectVLD - Select NEON load intrinsics. NumVecs should be /// 1, 2, 3 or 4. The opcode arrays specify the instructions used for @@ -246,10 +252,6 @@ private: SDValue GetVLDSTAlign(SDValue Align, const SDLoc &dl, unsigned NumVecs, bool is64BitVector); - /// Returns the number of instructions required to materialize the given - /// constant in a register, or 3 if a literal pool load is needed. - unsigned ConstantMaterializationCost(unsigned Val) const; - /// Checks if N is a multiplication by a constant where we can extract out a /// power of two from the constant so that it can be used in a shift, but only /// if it simplifies the materialization of the constant. 
Returns true if it @@ -450,27 +452,6 @@ bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift, (ShAmt == 2 || (Subtarget->isSwift() && ShAmt == 1)); } -unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const { - if (Subtarget->isThumb()) { - if (Val <= 255) return 1; // MOV - if (Subtarget->hasV6T2Ops() && - (Val <= 0xffff || // MOV - ARM_AM::getT2SOImmVal(Val) != -1 || // MOVW - ARM_AM::getT2SOImmVal(~Val) != -1)) // MVN - return 1; - if (Val <= 510) return 2; // MOV + ADDi8 - if (~Val <= 255) return 2; // MOV + MVN - if (ARM_AM::isThumbImmShiftedVal(Val)) return 2; // MOV + LSL - } else { - if (ARM_AM::getSOImmVal(Val) != -1) return 1; // MOV - if (ARM_AM::getSOImmVal(~Val) != -1) return 1; // MVN - if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW - if (ARM_AM::isSOImmTwoPartVal(Val)) return 2; // two instrs - } - if (Subtarget->useMovt()) return 2; // MOVW + MOVT - return 3; // Literal pool load -} - bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N, unsigned MaxShift, unsigned &PowerOfTwo, @@ -500,8 +481,8 @@ bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N, // Only optimise if the new cost is better unsigned NewMulConstVal = MulConstVal / (1 << PowerOfTwo); NewMulConst = CurDAG->getConstant(NewMulConstVal, SDLoc(N), MVT::i32); - unsigned OldCost = ConstantMaterializationCost(MulConstVal); - unsigned NewCost = ConstantMaterializationCost(NewMulConstVal); + unsigned OldCost = ConstantMaterializationCost(MulConstVal, Subtarget); + unsigned NewCost = ConstantMaterializationCost(NewMulConstVal, Subtarget); return NewCost < OldCost; } @@ -1172,6 +1153,28 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, return false; } +template <unsigned Shift> +bool ARMDAGToDAGISel::SelectTAddrModeImm7(SDValue N, SDValue &Base, + SDValue &OffImm) { + if (N.getOpcode() == ISD::SUB || CurDAG->isBaseWithConstantOffset(N)) { + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), 1 << Shift, -0x7f, 0x80, + RHSC)) { + Base = N.getOperand(0); + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + OffImm = + CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32); + return true; + } + } + + // Base only. 
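The imm7 selector above accepts an offset only when it is an exact multiple of the access size and the scaled value fits the signed 7-bit field; the base-only fallback below handles everything else. A standalone sketch of that predicate, mirroring the -0x7f/0x80 bounds passed to isScaledConstantInRange:

```cpp
// True if Off is encodable as a 7-bit scaled immediate with scale 1 << Shift.
bool isScaledImm7(int Off, unsigned Shift) {
  int Scale = 1 << Shift;
  if (Off % Scale != 0)
    return false;
  int Scaled = Off / Scale;
  return Scaled >= -0x7f && Scaled < 0x80;
}
```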
+ Base = N; + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + return true; +} + //===----------------------------------------------------------------------===// // Thumb 2 Addressing Modes @@ -1278,35 +1281,59 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, return false; } -template<unsigned Shift> -bool ARMDAGToDAGISel::SelectT2AddrModeImm7(SDValue N, - SDValue &Base, SDValue &OffImm) { - if (N.getOpcode() == ISD::SUB || - CurDAG->isBaseWithConstantOffset(N)) { - if (auto RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - int RHSC = (int)RHS->getZExtValue(); - if (N.getOpcode() == ISD::SUB) - RHSC = -RHSC; - - if (isShiftedInt<7, Shift>(RHSC)) { - Base = N.getOperand(0); - if (Base.getOpcode() == ISD::FrameIndex) { - int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex( +template <unsigned Shift> +bool ARMDAGToDAGISel::SelectT2AddrModeImm7(SDValue N, SDValue &Base, + SDValue &OffImm) { + if (N.getOpcode() == ISD::SUB || CurDAG->isBaseWithConstantOffset(N)) { + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), 1 << Shift, -0x7f, 0x80, + RHSC)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); - } - OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); - return true; } + + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + OffImm = + CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32); + return true; } } // Base only. Base = N; - OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } +template <unsigned Shift> +bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, + SDValue &OffImm) { + return SelectT2AddrModeImm7Offset(Op, N, OffImm, Shift); +} + +bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, + SDValue &OffImm, + unsigned Shift) { + unsigned Opcode = Op->getOpcode(); + ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) + ? cast<LoadSDNode>(Op)->getAddressingMode() + : cast<StoreSDNode>(Op)->getAddressingMode(); + int RHSC; + if (isScaledConstantInRange(N, 1 << Shift, 0, 0x80, RHSC)) { // 7 bits. + OffImm = + ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC)) + ? CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32) + : CurDAG->getTargetConstant(-RHSC * (1 << Shift), SDLoc(N), + MVT::i32); + return true; + } + return false; +} + bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N, SDValue &Base, SDValue &OffReg, SDValue &ShImm) { @@ -1565,6 +1592,68 @@ bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) { return false; } +bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::MemIndexedMode AM = LD->getAddressingMode(); + if (AM == ISD::UNINDEXED) + return false; + EVT LoadedVT = LD->getMemoryVT(); + if (!LoadedVT.isVector()) + return false; + bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; + SDValue Offset; + bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); + unsigned Opcode = 0; + unsigned Align = LD->getAlignment(); + bool IsLE = Subtarget->isLittle(); + + if (Align >= 2 && LoadedVT == MVT::v4i16 && + SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1)) { + if (isSExtLd) + Opcode = isPre ? ARM::MVE_VLDRHS32_pre : ARM::MVE_VLDRHS32_post; + else + Opcode = isPre ? 
ARM::MVE_VLDRHU32_pre : ARM::MVE_VLDRHU32_post; + } else if (LoadedVT == MVT::v8i8 && + SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) { + if (isSExtLd) + Opcode = isPre ? ARM::MVE_VLDRBS16_pre : ARM::MVE_VLDRBS16_post; + else + Opcode = isPre ? ARM::MVE_VLDRBU16_pre : ARM::MVE_VLDRBU16_post; + } else if (LoadedVT == MVT::v4i8 && + SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) { + if (isSExtLd) + Opcode = isPre ? ARM::MVE_VLDRBS32_pre : ARM::MVE_VLDRBS32_post; + else + Opcode = isPre ? ARM::MVE_VLDRBU32_pre : ARM::MVE_VLDRBU32_post; + } else if (Align >= 4 && + (IsLE || LoadedVT == MVT::v4i32 || LoadedVT == MVT::v4f32) && + SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 2)) + Opcode = isPre ? ARM::MVE_VLDRWU32_pre : ARM::MVE_VLDRWU32_post; + else if (Align >= 2 && + (IsLE || LoadedVT == MVT::v8i16 || LoadedVT == MVT::v8f16) && + SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1)) + Opcode = isPre ? ARM::MVE_VLDRHU16_pre : ARM::MVE_VLDRHU16_post; + else if ((IsLE || LoadedVT == MVT::v16i8) && + SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) + Opcode = isPre ? ARM::MVE_VLDRBU8_pre : ARM::MVE_VLDRBU8_post; + else + return false; + + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + SDValue Ops[] = {Base, Offset, + CurDAG->getTargetConstant(ARMVCC::None, SDLoc(N), MVT::i32), + CurDAG->getRegister(0, MVT::i32), Chain}; + SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), LD->getValueType(0), + MVT::i32, MVT::Other, Ops); + transferMemOperands(N, New); + ReplaceUses(SDValue(N, 0), SDValue(New, 1)); + ReplaceUses(SDValue(N, 1), SDValue(New, 0)); + ReplaceUses(SDValue(N, 2), SDValue(New, 2)); + CurDAG->RemoveDeadNode(N); + return true; +} + /// Form a GPRPair pseudo register from a pair of GPR regs. SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) { SDLoc dl(V0.getNode()); @@ -2701,7 +2790,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) { case ISD::Constant: { unsigned Val = cast<ConstantSDNode>(N)->getZExtValue(); // If we can't materialize the constant we need to use a literal pool - if (ConstantMaterializationCost(Val) > 2) { + if (ConstantMaterializationCost(Val, Subtarget) > 2) { SDValue CPIdx = CurDAG->getTargetConstantPool( ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val), TLI->getPointerTy(CurDAG->getDataLayout())); @@ -2842,8 +2931,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) { bool PreferImmediateEncoding = Subtarget->hasThumb2() && (is_t2_so_imm(Imm) || is_t2_so_imm_not(Imm)); if (!PreferImmediateEncoding && - ConstantMaterializationCost(Imm) > - ConstantMaterializationCost(~Imm)) { + ConstantMaterializationCost(Imm, Subtarget) > + ConstantMaterializationCost(~Imm, Subtarget)) { // The current immediate costs more to materialize than a negated // immediate, so negate the immediate and use a BIC. SDValue NewImm = @@ -2987,6 +3076,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) { return; } case ISD::LOAD: { + if (Subtarget->hasMVEIntegerOps() && tryMVEIndexedLoad(N)) + return; if (Subtarget->isThumb() && Subtarget->hasThumb2()) { if (tryT2IndexedLoad(N)) return; @@ -2998,13 +3089,26 @@ void ARMDAGToDAGISel::Select(SDNode *N) { // Other cases are autogenerated. break; } - case ARMISD::WLS: { - SDValue Ops[] = { N->getOperand(1), // Loop count - N->getOperand(2), // Exit target + case ARMISD::WLS: + case ARMISD::LE: { + SDValue Ops[] = { N->getOperand(1), + N->getOperand(2), + N->getOperand(0) }; + unsigned Opc = N->getOpcode() == ARMISD::WLS ? 
+ ARM::t2WhileLoopStart : ARM::t2LoopEnd; + SDNode *New = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + ReplaceUses(N, New); + CurDAG->RemoveDeadNode(N); + return; + } + case ARMISD::LOOP_DEC: { + SDValue Ops[] = { N->getOperand(1), + N->getOperand(2), N->getOperand(0) }; - SDNode *LoopStart = - CurDAG->getMachineNode(ARM::t2WhileLoopStart, dl, MVT::Other, Ops); - ReplaceUses(N, LoopStart); + SDNode *Dec = + CurDAG->getMachineNode(ARM::t2LoopDec, dl, + CurDAG->getVTList(MVT::i32, MVT::Other), Ops); + ReplaceUses(N, Dec); CurDAG->RemoveDeadNode(N); return; } @@ -4365,7 +4469,7 @@ bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){ // Replace the two GPRs with 1 GPRPair and copy values from GPRPair to // the original GPRs. - unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + Register GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass); PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped); SDValue Chain = SDValue(N,0); @@ -4401,7 +4505,7 @@ bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){ // Copy REG_SEQ into a GPRPair-typed VR and replace the original two // i32 VRs of inline asm with it. - unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + Register GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass); PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped); Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1)); diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 18bb9bf3eccc..db26feb57010 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -245,7 +245,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 }; for (auto VT : IntTypes) { - addRegisterClass(VT, &ARM::QPRRegClass); + addRegisterClass(VT, &ARM::MQPRRegClass); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); @@ -258,12 +258,31 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::MLOAD, VT, Custom); + setOperationAction(ISD::MSTORE, VT, Legal); + setOperationAction(ISD::CTLZ, VT, Legal); + setOperationAction(ISD::CTTZ, VT, Custom); + setOperationAction(ISD::BITREVERSE, VT, Legal); + setOperationAction(ISD::BSWAP, VT, Legal); + setOperationAction(ISD::SADDSAT, VT, Legal); + setOperationAction(ISD::UADDSAT, VT, Legal); + setOperationAction(ISD::SSUBSAT, VT, Legal); + setOperationAction(ISD::USUBSAT, VT, Legal); // No native support for these. 
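// Illustrative sketch, not part of the commit: scalar reference semantics for
// the ISD::SADDSAT / ISD::UADDSAT operations marked Legal above (the division
// ops just below have no MVE equivalent and are expanded instead). MVE's
// VQADD applies exactly this clamping independently to every lane.
#include <cassert>
#include <cstdint>
#include <limits>
int32_t sadd_sat32(int32_t a, int32_t b) {
  int64_t s = (int64_t)a + b;                  // widen so the sum cannot wrap
  if (s > std::numeric_limits<int32_t>::max()) s = std::numeric_limits<int32_t>::max();
  if (s < std::numeric_limits<int32_t>::min()) s = std::numeric_limits<int32_t>::min();
  return (int32_t)s;
}
uint32_t uadd_sat32(uint32_t a, uint32_t b) {
  uint32_t s = a + b;
  return s < a ? std::numeric_limits<uint32_t>::max() : s; // clamp on wrap-around
}
int main() {
  assert(sadd_sat32(INT32_MAX, 1) == INT32_MAX);
  assert(uadd_sat32(0xffffffffu, 5) == 0xffffffffu);
}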
setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::CTPOP, VT, Expand); + + // Vector reductions + setOperationAction(ISD::VECREDUCE_ADD, VT, Legal); + setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal); + setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal); + setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal); + setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal); if (!HasMVEFP) { setOperationAction(ISD::SINT_TO_FP, VT, Expand); @@ -271,11 +290,18 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); } + + // Pre and Post inc are supported on loads and stores + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, VT, Legal); + setIndexedStoreAction(im, VT, Legal); + } } const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 }; for (auto VT : FloatTypes) { - addRegisterClass(VT, &ARM::QPRRegClass); + addRegisterClass(VT, &ARM::MQPRRegClass); if (!HasMVEFP) setAllExpand(VT); @@ -287,6 +313,16 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::MLOAD, VT, Custom); + setOperationAction(ISD::MSTORE, VT, Legal); + + // Pre and Post inc are supported on loads and stores + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, VT, Legal); + setIndexedStoreAction(im, VT, Legal); + } if (HasMVEFP) { setOperationAction(ISD::FMINNUM, VT, Legal); @@ -314,7 +350,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { // vector types is inhibited at integer-only level. 
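// Illustrative sketch, not part of the commit: the pointer-update pattern that
// the PRE_INC / POST_INC indexed modes marked Legal above let the selector
// fold into a single MVE load or store with writeback (see the
// MVE_VLDR*_pre / _post selection in tryMVEIndexedLoad earlier in this diff).
// Scalar analogue:
#include <cstdint>
uint32_t load_post_inc(const uint32_t *&p) { return *p++; } // load, then advance base
uint32_t load_pre_inc(const uint32_t *&p)  { return *++p; } // advance base, then load
int main() {
  const uint32_t buf[4] = {10, 20, 30, 40};
  const uint32_t *p = buf;
  return load_post_inc(p) == 10 && load_pre_inc(p) == 30 ? 0 : 1;
}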
const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 }; for (auto VT : LongTypes) { - addRegisterClass(VT, &ARM::QPRRegClass); + addRegisterClass(VT, &ARM::MQPRRegClass); setAllExpand(VT); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); @@ -334,6 +370,33 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); + + // Pre and Post inc on these are legal, given the correct extends + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, MVT::v8i8, Legal); + setIndexedStoreAction(im, MVT::v8i8, Legal); + setIndexedLoadAction(im, MVT::v4i8, Legal); + setIndexedStoreAction(im, MVT::v4i8, Legal); + setIndexedLoadAction(im, MVT::v4i16, Legal); + setIndexedStoreAction(im, MVT::v4i16, Legal); + } + + // Predicate types + const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1}; + for (auto VT : pTypes) { + addRegisterClass(VT, &ARM::VCCRRegClass); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); + setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); + } } ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, @@ -645,8 +708,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); } - for (MVT VT : MVT::vector_valuetypes()) { - for (MVT InnerVT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { + for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); addAllExtLoads(VT, InnerVT, Expand); } @@ -669,8 +732,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, addMVEVectorTypes(Subtarget->hasMVEFloatOps()); // Combine low-overhead loop intrinsics so that we can lower i1 types. - if (Subtarget->hasLOB()) + if (Subtarget->hasLOB()) { setTargetDAGCombine(ISD::BRCOND); + setTargetDAGCombine(ISD::BR_CC); + } if (Subtarget->hasNEON()) { addDRTypeForNEON(MVT::v2f32); @@ -837,10 +902,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::SRA); - setTargetDAGCombine(ISD::SIGN_EXTEND); - setTargetDAGCombine(ISD::ZERO_EXTEND); - setTargetDAGCombine(ISD::ANY_EXTEND); - setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::FP_TO_UINT); setTargetDAGCombine(ISD::FDIV); @@ -849,7 +910,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // It is legal to extload from v4i8 to v4i16 or v4i32. 
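// Illustrative sketch, not part of the commit: what the "legal extload" noted
// above means element-wise -- a narrow v4i8 memory operand is widened to
// v4i16 or v4i32 as part of the load itself, with no separate extend node.
#include <cstdint>
void sextload_v4i8_to_v4i32(const int8_t src[4], int32_t dst[4]) {
  for (int i = 0; i < 4; ++i)
    dst[i] = src[i];             // SEXTLOAD; a uint8_t source models ZEXTLOAD
}
int main() {
  const int8_t a[4] = {-1, 2, -3, 4};
  int32_t b[4];
  sextload_v4i8_to_v4i32(a, b);
  return b[0] == -1 && b[2] == -3 ? 0 : 1;
}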
for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, MVT::v2i32}) { - for (MVT VT : MVT::integer_vector_valuetypes()) { + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) { setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal); setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal); setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal); @@ -861,6 +922,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::ANY_EXTEND); } if (!Subtarget->hasFP64()) { @@ -901,9 +966,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); } - if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()){ + if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) { setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); - setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); + if (Subtarget->hasFullFP16()) + setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); } if (!Subtarget->hasFP16()) @@ -955,6 +1021,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::ADDCARRY, MVT::i32, Custom); setOperationAction(ISD::SUBCARRY, MVT::i32, Custom); + if (Subtarget->hasDSP()) { + setOperationAction(ISD::SADDSAT, MVT::i8, Custom); + setOperationAction(ISD::SSUBSAT, MVT::i8, Custom); + setOperationAction(ISD::SADDSAT, MVT::i16, Custom); + setOperationAction(ISD::SSUBSAT, MVT::i16, Custom); + } + if (Subtarget->hasBaseDSP()) { + setOperationAction(ISD::SADDSAT, MVT::i32, Legal); + setOperationAction(ISD::SSUBSAT, MVT::i32, Legal); + } // i64 operation support. setOperationAction(ISD::MUL, MVT::i64, Expand); @@ -972,6 +1048,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL, MVT::i64, Custom); setOperationAction(ISD::SRA, MVT::i64, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); // MVE lowers 64 bit shifts to lsll and lsrl @@ -991,7 +1068,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // ARM does not have ROTL. setOperationAction(ISD::ROTL, MVT::i32, Expand); - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); } @@ -1365,14 +1442,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // On ARM arguments smaller than 4 bytes are extended, so all arguments // are at least 4 bytes aligned. - setMinStackArgumentAlignment(4); + setMinStackArgumentAlignment(Align(4)); // Prefer likely predicted branches to selects on out-of-order cores. PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder(); - setPrefLoopAlignment(Subtarget->getPrefLoopAlignment()); + setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment())); - setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2); + setMinFunctionAlignment(Subtarget->isThumb() ? 
Align(2) : Align(4)); if (Subtarget->isThumb() || Subtarget->isThumb2()) setTargetDAGCombine(ISD::ABS); @@ -1472,6 +1549,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::ADDE: return "ARMISD::ADDE"; case ARMISD::SUBC: return "ARMISD::SUBC"; case ARMISD::SUBE: return "ARMISD::SUBE"; + case ARMISD::LSLS: return "ARMISD::LSLS"; case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD"; case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; @@ -1496,16 +1574,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK"; case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; - case ARMISD::VCEQ: return "ARMISD::VCEQ"; - case ARMISD::VCEQZ: return "ARMISD::VCEQZ"; - case ARMISD::VCGE: return "ARMISD::VCGE"; - case ARMISD::VCGEZ: return "ARMISD::VCGEZ"; - case ARMISD::VCLEZ: return "ARMISD::VCLEZ"; - case ARMISD::VCGEU: return "ARMISD::VCGEU"; - case ARMISD::VCGT: return "ARMISD::VCGT"; - case ARMISD::VCGTZ: return "ARMISD::VCGTZ"; - case ARMISD::VCLTZ: return "ARMISD::VCLTZ"; - case ARMISD::VCGTU: return "ARMISD::VCGTU"; + case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST"; + case ARMISD::VCMP: return "ARMISD::VCMP"; + case ARMISD::VCMPZ: return "ARMISD::VCMPZ"; case ARMISD::VTST: return "ARMISD::VTST"; case ARMISD::VSHLs: return "ARMISD::VSHLs"; @@ -1543,6 +1614,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VTRN: return "ARMISD::VTRN"; case ARMISD::VTBL1: return "ARMISD::VTBL1"; case ARMISD::VTBL2: return "ARMISD::VTBL2"; + case ARMISD::VMOVN: return "ARMISD::VMOVN"; case ARMISD::VMULLs: return "ARMISD::VMULLs"; case ARMISD::VMULLu: return "ARMISD::VMULLu"; case ARMISD::UMAAL: return "ARMISD::UMAAL"; @@ -1560,6 +1632,10 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX"; case ARMISD::SMMLAR: return "ARMISD::SMMLAR"; case ARMISD::SMMLSR: return "ARMISD::SMMLSR"; + case ARMISD::QADD16b: return "ARMISD::QADD16b"; + case ARMISD::QSUB16b: return "ARMISD::QSUB16b"; + case ARMISD::QADD8b: return "ARMISD::QADD8b"; + case ARMISD::QSUB8b: return "ARMISD::QSUB8b"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; case ARMISD::BFI: return "ARMISD::BFI"; case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; @@ -1589,6 +1665,11 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; case ARMISD::WLS: return "ARMISD::WLS"; + case ARMISD::LE: return "ARMISD::LE"; + case ARMISD::LOOP_DEC: return "ARMISD::LOOP_DEC"; + case ARMISD::CSINV: return "ARMISD::CSINV"; + case ARMISD::CSNEG: return "ARMISD::CSNEG"; + case ARMISD::CSINC: return "ARMISD::CSINC"; } return nullptr; } @@ -1597,6 +1678,11 @@ EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const { if (!VT.isVector()) return getPointerTy(DL); + + // MVE has a predicate register. + if (Subtarget->hasMVEIntegerOps() && + (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) + return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); return VT.changeVectorElementTypeToInteger(); } @@ -1726,34 +1812,22 @@ static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. 
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, - ARMCC::CondCodes &CondCode2, bool &InvalidOnQNaN) { + ARMCC::CondCodes &CondCode2) { CondCode2 = ARMCC::AL; - InvalidOnQNaN = true; switch (CC) { default: llvm_unreachable("Unknown FP condition!"); case ISD::SETEQ: - case ISD::SETOEQ: - CondCode = ARMCC::EQ; - InvalidOnQNaN = false; - break; + case ISD::SETOEQ: CondCode = ARMCC::EQ; break; case ISD::SETGT: case ISD::SETOGT: CondCode = ARMCC::GT; break; case ISD::SETGE: case ISD::SETOGE: CondCode = ARMCC::GE; break; case ISD::SETOLT: CondCode = ARMCC::MI; break; case ISD::SETOLE: CondCode = ARMCC::LS; break; - case ISD::SETONE: - CondCode = ARMCC::MI; - CondCode2 = ARMCC::GT; - InvalidOnQNaN = false; - break; + case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; case ISD::SETO: CondCode = ARMCC::VC; break; case ISD::SETUO: CondCode = ARMCC::VS; break; - case ISD::SETUEQ: - CondCode = ARMCC::EQ; - CondCode2 = ARMCC::VS; - InvalidOnQNaN = false; - break; + case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break; case ISD::SETUGT: CondCode = ARMCC::HI; break; case ISD::SETUGE: CondCode = ARMCC::PL; break; case ISD::SETLT: @@ -1761,10 +1835,7 @@ static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, case ISD::SETLE: case ISD::SETULE: CondCode = ARMCC::LE; break; case ISD::SETNE: - case ISD::SETUNE: - CondCode = ARMCC::NE; - InvalidOnQNaN = false; - break; + case ISD::SETUNE: CondCode = ARMCC::NE; break; } } @@ -1988,6 +2059,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); + MachineFunction::CallSiteInfo CSInfo; bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool isThisReturn = false; auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); @@ -2112,6 +2184,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, "unexpected use of 'returned'"); isThisReturn = true; } + const TargetOptions &Options = DAG.getTarget().Options; + if (Options.EnableDebugEntryValues) + CSInfo.emplace_back(VA.getLocReg(), i); RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else if (isByVal) { assert(VA.isMemLoc()); @@ -2347,12 +2422,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); if (isTailCall) { MF.getFrameInfo().setHasTailCall(); - return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); + SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); + DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); + return Ret; } // Returns a chain and a flag for retval copy to use. 
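// Illustrative sketch, not part of the commit: the CondCode2 mechanism in the
// FPCCToARMCC change earlier in this hunk. Some IEEE predicates need two ARM
// condition checks against one FMSTAT result; e.g. SETUEQ ("equal or
// unordered") maps to EQ plus a second check of VS, since the VFP compare
// sets the V flag exactly when an input is a NaN.
#include <cmath>
bool setueq(double a, double b) {
  bool eq = (a == b);                              // ARMCC::EQ
  bool unordered = std::isnan(a) || std::isnan(b); // ARMCC::VS
  return eq || unordered;    // two conditional ops, but only one compare issued
}
int main() { return setueq(1.0, NAN) && !setueq(1.0, 2.0) ? 0 : 1; }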
Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); InFlag = Chain.getValue(1); + DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), DAG.getIntPtrConstant(0, dl, true), InFlag, dl); @@ -2431,7 +2509,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, int FI = std::numeric_limits<int>::max(); if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); - if (!TargetRegisterInfo::isVirtualRegister(VR)) + if (!Register::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) @@ -3047,12 +3125,12 @@ ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op, // Load the current TEB (thread environment block) SDValue Ops[] = {Chain, - DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), - DAG.getConstant(15, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(13, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(2, DL, MVT::i32)}; + DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), + DAG.getTargetConstant(15, DL, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32), + DAG.getTargetConstant(13, DL, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32), + DAG.getTargetConstant(2, DL, MVT::i32)}; SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, DAG.getVTList(MVT::i32, MVT::Other), Ops); @@ -3498,6 +3576,48 @@ SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, Op.getOperand(0)); } +SDValue ARMTargetLowering::LowerINTRINSIC_VOID( + SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { + unsigned IntNo = + cast<ConstantSDNode>( + Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other)) + ->getZExtValue(); + switch (IntNo) { + default: + return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::arm_gnu_eabi_mcount: { + MachineFunction &MF = DAG.getMachineFunction(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc dl(Op); + SDValue Chain = Op.getOperand(0); + // call "\01__gnu_mcount_nc" + const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); + const uint32_t *Mask = + ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C); + assert(Mask && "Missing call preserved mask for calling convention"); + // Mark LR an implicit live-in. + unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); + SDValue ReturnAddress = + DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT); + std::vector<EVT> ResultTys = {MVT::Other, MVT::Glue}; + SDValue Callee = + DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0); + SDValue RegisterMask = DAG.getRegisterMask(Mask); + if (Subtarget->isThumb()) + return SDValue( + DAG.getMachineNode( + ARM::tBL_PUSHLR, dl, ResultTys, + {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT), + DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}), + 0); + return SDValue( + DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys, + {ReturnAddress, Callee, RegisterMask, Chain}), + 0); + } + } +} + SDValue ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { @@ -3898,6 +4018,12 @@ SDValue ARMTargetLowering::LowerFormalArguments( // Transform the arguments in physical registers into virtual ones. unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); + + // If this value is passed in r0 and has the returned attribute (e.g. 
+ // C++ 'structors), record this fact for later use. + if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) { + AFI->setPreservesR0(); + } } // If this is an 8 or 16-bit value, it is really passed promoted @@ -4049,6 +4175,67 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, std::swap(LHS, RHS); } + // Thumb1 has very limited immediate modes, so turning an "and" into a + // shift can save multiple instructions. + // + // If we have (x & C1), and C1 is an appropriate mask, we can transform it + // into "((x << n) >> n)". But that isn't necessarily profitable on its + // own. If it's the operand to an unsigned comparison with an immediate, + // we can eliminate one of the shifts: we transform + // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)". + // + // We avoid transforming cases which aren't profitable due to encoding + // details: + // + // 1. C2 fits into the immediate field of a cmp, and the transformed version + // would not; in that case, we're essentially trading one immediate load for + // another. + // 2. C1 is 255 or 65535, so we can use uxtb or uxth. + // 3. C2 is zero; we have other code for this special case. + // + // FIXME: Figure out profitability for Thumb2; we usually can't save an + // instruction, since the AND is always one instruction anyway, but we could + // use narrow instructions in some cases. + if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND && + LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) && + LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) && + !isSignedIntSetCC(CC)) { + unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue(); + auto *RHSC = cast<ConstantSDNode>(RHS.getNode()); + uint64_t RHSV = RHSC->getZExtValue(); + if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) { + unsigned ShiftBits = countLeadingZeros(Mask); + if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) { + SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32); + LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt); + RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32); + } + } + } + + // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a + // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same + // way a cmp would. + // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and + // some tweaks to the heuristics for the previous and->shift transform. + // FIXME: Optimize cases where the LHS isn't a shift. 
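// Illustrative sketch, not part of the commit: checking the Thumb1
// AND-to-shift comparison rewrite implemented just above. For a low-bit mask
// C1 = (1u << k) - 1 and n = 32 - k, "(x & C1) == C2" is equivalent to
// "(x << n) == (C2 << n)" in 32-bit arithmetic (both sides discard the same
// high bits), which avoids materialising C1 as an immediate.
#include <cassert>
#include <cstdint>
int main() {
  const uint32_t c1 = 0x3ff;      // isMask_32(c1), deliberately not 255/65535
  const unsigned n = 22;          // countLeadingZeros(c1) for a 10-bit mask
  for (uint32_t x = 0; x < 4096; ++x)
    for (uint32_t c2 = 0; c2 <= c1; c2 += 7)    // keeps (c2 & ~c1) == 0
      assert(((x & c1) == c2) == ((x << n) == (c2 << n)));
}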
+ if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
+ isa<ConstantSDNode>(RHS) &&
+ cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U &&
+ CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
+ cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) {
+ unsigned ShiftAmt =
+ cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1;
+ SDValue Shift = DAG.getNode(ARMISD::LSLS, dl,
+ DAG.getVTList(MVT::i32, MVT::i32),
+ LHS.getOperand(0),
+ DAG.getConstant(ShiftAmt, dl, MVT::i32));
+ SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
+ Shift.getValue(1), SDValue());
+ ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
+ return Chain.getValue(1);
+ }
+
ARMCC::CondCodes CondCode = IntCCToARMCC(CC);

// If the RHS is a constant zero then the V (overflow) flag will never be
@@ -4083,15 +4270,13 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,

/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
- SelectionDAG &DAG, const SDLoc &dl,
- bool InvalidOnQNaN) const {
+ SelectionDAG &DAG, const SDLoc &dl) const {
assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
SDValue Cmp;
- SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32);
if (!isFloatingPointZero(RHS))
- Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS, C);
+ Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
else
- Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS, C);
+ Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
}
@@ -4108,12 +4293,10 @@ ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
Cmp = Cmp.getOperand(0);
Opc = Cmp.getOpcode();
if (Opc == ARMISD::CMPFP)
- Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
- Cmp.getOperand(1), Cmp.getOperand(2));
+ Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
else {
assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
- Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
- Cmp.getOperand(1));
+ Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
}
return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
}
@@ -4276,6 +4459,35 @@ SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}
+static SDValue LowerSADDSUBSAT(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ EVT VT = Op.getValueType();
+ if (!Subtarget->hasDSP())
+ return SDValue();
+ if (!VT.isSimple())
+ return SDValue();
+
+ unsigned NewOpcode;
+ bool IsAdd = Op->getOpcode() == ISD::SADDSAT;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::i8:
+ NewOpcode = IsAdd ? ARMISD::QADD8b : ARMISD::QSUB8b;
+ break;
+ case MVT::i16:
+ NewOpcode = IsAdd ? 
ARMISD::QADD16b : ARMISD::QSUB16b;
+ break;
+ }
+
+ SDLoc dl(Op);
+ SDValue Add =
+ DAG.getNode(NewOpcode, dl, MVT::i32,
+ DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
+ DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
+}
+
SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Cond = Op.getOperand(0);
SDValue SelectTrue = Op.getOperand(1);
@@ -4656,10 +4868,62 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
SDValue TrueVal = Op.getOperand(2);
SDValue FalseVal = Op.getOperand(3);
+ ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
+ ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
+
+ if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
+ LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
+ unsigned TVal = CTVal->getZExtValue();
+ unsigned FVal = CFVal->getZExtValue();
+ unsigned Opcode = 0;
+
+ if (TVal == ~FVal) {
+ Opcode = ARMISD::CSINV;
+ } else if (TVal == ~FVal + 1) {
+ Opcode = ARMISD::CSNEG;
+ } else if (TVal + 1 == FVal) {
+ Opcode = ARMISD::CSINC;
+ } else if (TVal == FVal + 1) {
+ Opcode = ARMISD::CSINC;
+ std::swap(TrueVal, FalseVal);
+ std::swap(TVal, FVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+
+ if (Opcode) {
+ // If one of the constants is cheaper than another, materialise the
+ // cheaper one and let the csel generate the other.
+ if (Opcode != ARMISD::CSINC &&
+ HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
+ std::swap(TrueVal, FalseVal);
+ std::swap(TVal, FVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+
+ // Attempt to use ZR, checking whether TVal is 0, possibly inverting the
+ // condition to get there. CSINC is not invertible like the other two
+ // (~(~a) == a, -(-a) == a, but (a+1)+1 != a).
+ if (FVal == 0 && Opcode != ARMISD::CSINC) {
+ std::swap(TrueVal, FalseVal);
+ std::swap(TVal, FVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+ if (TVal == 0)
+ TrueVal = DAG.getRegister(ARM::ZR, MVT::i32);
+
+ // Drop FVal's value, because we can get it by inverting/negating TVal.
+ FalseVal = TrueVal;
+
+ SDValue ARMcc;
+ SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+ EVT VT = TrueVal.getValueType();
+ return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
+ }
+ }

if (isUnsupportedFloatingType(LHS.getValueType())) {
DAG.getTargetLoweringInfo().softenSetCCOperands(
- DAG, LHS.getValueType(), LHS, RHS, CC, dl);
+ DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);

// If softenSetCCOperands only returned one value, we should compare it to
// zero.
@@ -4701,8 +4965,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
}

ARMCC::CondCodes CondCode, CondCode2;
- bool InvalidOnQNaN;
- FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
+ FPCCToARMCC(CC, CondCode, CondCode2);

// Normalize the fp compare. 
If RHS is zero we prefer to keep it there so we // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we @@ -4727,13 +4990,13 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { } SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); - SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); + SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); if (CondCode2 != ARMCC::AL) { SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); // FIXME: Needs another CMP because flag can have but one use. - SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); + SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); } return Result; @@ -4903,7 +5166,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { if (isUnsupportedFloatingType(LHS.getValueType())) { DAG.getTargetLoweringInfo().softenSetCCOperands( - DAG, LHS.getValueType(), LHS, RHS, CC, dl); + DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS); // If softenSetCCOperands only returned one value, we should compare it to // zero. @@ -4960,11 +5223,10 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { } ARMCC::CondCodes CondCode, CondCode2; - bool InvalidOnQNaN; - FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN); + FPCCToARMCC(CC, CondCode, CondCode2); SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); - SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); + SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; @@ -5056,8 +5318,9 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { else LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); + MakeLibCallOptions CallOptions; return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), - /*isSigned*/ false, SDLoc(Op)).first; + CallOptions, SDLoc(Op)).first; } return Op; @@ -5120,8 +5383,9 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { else LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + MakeLibCallOptions CallOptions; return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), - /*isSigned*/ false, SDLoc(Op)).first; + CallOptions, SDLoc(Op)).first; } return Op; @@ -5140,7 +5404,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { if (UseNEON) { // Use VBSL to copy the sign bit. - unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80); + unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80); SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); EVT OpVT = (VT == MVT::f32) ? 
MVT::v2i32 : MVT::v1i64; @@ -5163,7 +5427,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); - SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff), + SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32); AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, @@ -5243,7 +5507,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - unsigned FrameReg = ARI.getFrameRegister(MF); + Register FrameReg = ARI.getFrameRegister(MF); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, @@ -5253,9 +5517,9 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { - unsigned Reg = StringSwitch<unsigned>(RegName) +Register ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const { + Register Reg = StringSwitch<unsigned>(RegName) .Case("sp", ARM::SP) .Default(0); if (Reg) @@ -5576,8 +5840,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { SDLoc dl(N); EVT VT = N->getValueType(0); - if (VT.isVector()) { - assert(ST->hasNEON()); + if (VT.isVector() && ST->hasNEON()) { // Compute the least significant set bit: LSB = X & -X SDValue X = N->getOperand(0); @@ -5777,14 +6040,15 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, unsigned ShPartsOpc = ARMISD::LSLL; ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt); - // If the shift amount is greater than 32 then do the default optimisation - if (Con && Con->getZExtValue() > 32) + // If the shift amount is greater than 32 or has a greater bitwidth than 64 + // then do the default optimisation + if (ShAmt->getValueType(0).getSizeInBits() > 64 || + (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32))) return SDValue(); - // Extract the lower 32 bits of the shift amount if it's an i64 - if (ShAmt->getValueType(0) == MVT::i64) - ShAmt = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ShAmt, - DAG.getConstant(0, dl, MVT::i32)); + // Extract the lower 32 bits of the shift amount if it's not an i32 + if (ShAmt->getValueType(0) != MVT::i32) + ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32); if (ShOpc == ISD::SRL) { if (!Con) @@ -5839,20 +6103,37 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); } -static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { - SDValue TmpOp0, TmpOp1; +static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { bool Invert = false; bool Swap = false; - unsigned Opc = 0; + unsigned Opc = ARMCC::AL; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); - EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); EVT VT = Op.getValueType(); ISD::CondCode SetCCOpcode = 
cast<CondCodeSDNode>(CC)->get(); SDLoc dl(Op); + EVT CmpVT; + if (ST->hasNEON()) + CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); + else { + assert(ST->hasMVEIntegerOps() && + "No hardware support for integer vector comparison!"); + + if (Op.getValueType().getVectorElementType() != MVT::i1) + return SDValue(); + + // Make sure we expand floating point setcc to scalar if we do not have + // mve.fp, so that we can handle them from there. + if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps()) + return SDValue(); + + CmpVT = VT; + } + if (Op0.getValueType().getVectorElementType() == MVT::i64 && (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) { // Special-case integer 64-bit equality comparisons. They aren't legal, @@ -5880,60 +6161,74 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { switch (SetCCOpcode) { default: llvm_unreachable("Illegal FP comparison"); case ISD::SETUNE: - case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; + case ISD::SETNE: + if (ST->hasMVEFloatOps()) { + Opc = ARMCC::NE; break; + } else { + Invert = true; LLVM_FALLTHROUGH; + } case ISD::SETOEQ: - case ISD::SETEQ: Opc = ARMISD::VCEQ; break; + case ISD::SETEQ: Opc = ARMCC::EQ; break; case ISD::SETOLT: case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETOGT: - case ISD::SETGT: Opc = ARMISD::VCGT; break; + case ISD::SETGT: Opc = ARMCC::GT; break; case ISD::SETOLE: case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETOGE: - case ISD::SETGE: Opc = ARMISD::VCGE; break; + case ISD::SETGE: Opc = ARMCC::GE; break; case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break; + case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break; case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break; + case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break; case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; - case ISD::SETONE: + case ISD::SETONE: { // Expand this to (OLT | OGT). - TmpOp0 = Op0; - TmpOp1 = Op1; - Opc = ISD::OR; - Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); - Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1); - break; - case ISD::SETUO: - Invert = true; - LLVM_FALLTHROUGH; - case ISD::SETO: + SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, + DAG.getConstant(ARMCC::GT, dl, MVT::i32)); + SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, + DAG.getConstant(ARMCC::GT, dl, MVT::i32)); + SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); + if (Invert) + Result = DAG.getNOT(dl, Result, VT); + return Result; + } + case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH; + case ISD::SETO: { // Expand this to (OLT | OGE). - TmpOp0 = Op0; - TmpOp1 = Op1; - Opc = ISD::OR; - Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); - Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1); - break; + SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, + DAG.getConstant(ARMCC::GT, dl, MVT::i32)); + SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, + DAG.getConstant(ARMCC::GE, dl, MVT::i32)); + SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); + if (Invert) + Result = DAG.getNOT(dl, Result, VT); + return Result; + } } } else { // Integer comparisons. 
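// Illustrative sketch, not part of the commit: the SETONE/SETO expansions
// just above. "Ordered not-equal" is built from two VCMP(GT) results OR'ed
// together; a NaN operand fails both greater-than compares, so unordered
// lanes correctly come out false.
#include <cmath>
bool setone_lane(float a, float b) {
  bool gt = a > b;   // VCMP ..., GT
  bool lt = b > a;   // VCMP ..., GT with the operands swapped
  return gt || lt;   // ISD::OR of the two predicate vectors
}
int main() { return !setone_lane(1.0f, NAN) && setone_lane(1.0f, 2.0f) ? 0 : 1; }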
switch (SetCCOpcode) { default: llvm_unreachable("Illegal integer comparison"); - case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; - case ISD::SETEQ: Opc = ARMISD::VCEQ; break; + case ISD::SETNE: + if (ST->hasMVEIntegerOps()) { + Opc = ARMCC::NE; break; + } else { + Invert = true; LLVM_FALLTHROUGH; + } + case ISD::SETEQ: Opc = ARMCC::EQ; break; case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETGT: Opc = ARMISD::VCGT; break; + case ISD::SETGT: Opc = ARMCC::GT; break; case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETGE: Opc = ARMISD::VCGE; break; + case ISD::SETGE: Opc = ARMCC::GE; break; case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETUGT: Opc = ARMISD::VCGTU; break; + case ISD::SETUGT: Opc = ARMCC::HI; break; case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETUGE: Opc = ARMISD::VCGEU; break; + case ISD::SETUGE: Opc = ARMCC::HS; break; } // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). - if (Opc == ARMISD::VCEQ) { + if (ST->hasNEON() && Opc == ARMCC::EQ) { SDValue AndOp; if (ISD::isBuildVectorAllZeros(Op1.getNode())) AndOp = Op0; @@ -5945,10 +6240,12 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { AndOp = AndOp.getOperand(0); if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { - Opc = ARMISD::VTST; Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); - Invert = !Invert; + SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1); + if (!Invert) + Result = DAG.getNOT(dl, Result, VT); + return Result; } } } @@ -5962,31 +6259,20 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { if (ISD::isBuildVectorAllZeros(Op1.getNode())) SingleOp = Op0; else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { - if (Opc == ARMISD::VCGE) - Opc = ARMISD::VCLEZ; - else if (Opc == ARMISD::VCGT) - Opc = ARMISD::VCLTZ; + if (Opc == ARMCC::GE) + Opc = ARMCC::LE; + else if (Opc == ARMCC::GT) + Opc = ARMCC::LT; SingleOp = Op1; } SDValue Result; if (SingleOp.getNode()) { - switch (Opc) { - case ARMISD::VCEQ: - Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break; - case ARMISD::VCGE: - Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break; - case ARMISD::VCLEZ: - Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break; - case ARMISD::VCGT: - Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break; - case ARMISD::VCLTZ: - Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break; - default: - Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); - } + Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp, + DAG.getConstant(Opc, dl, MVT::i32)); } else { - Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); + Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, + DAG.getConstant(Opc, dl, MVT::i32)); } Result = DAG.getSExtOrTrunc(Result, dl, VT); @@ -6027,13 +6313,13 @@ static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { CCR, Chain.getValue(1)); } -/// isNEONModifiedImm - Check if the specified splat value corresponds to a -/// valid vector constant for a NEON or MVE instruction with a "modified immediate" -/// operand (e.g., VMOV). If so, return the encoded value. -static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, +/// isVMOVModifiedImm - Check if the specified splat value corresponds to a +/// valid vector constant for a NEON or MVE instruction with a "modified +/// immediate" operand (e.g., VMOV). If so, return the encoded value. 
+static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, bool is128Bits, - NEONModImmType type) { + VMOVModImmType type) { unsigned OpCmode, Imm; // SplatBitSize is set to the smallest size that splats the vector, so a @@ -6163,10 +6449,10 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, } default: - llvm_unreachable("unexpected size for isNEONModifiedImm"); + llvm_unreachable("unexpected size for isVMOVModifiedImm"); } - unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm); + unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm); return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); } @@ -6246,7 +6532,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, return SDValue(); // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). - SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), + SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, false, VMOVModImm); if (NewVal != SDValue()) { SDLoc DL(Op); @@ -6263,7 +6549,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, } // Finally, try a VMVN.i32 - NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, + NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, false, VMVNModImm); if (NewVal != SDValue()) { SDLoc DL(Op); @@ -6649,6 +6935,29 @@ static bool isReverseMask(ArrayRef<int> M, EVT VT) { return true; } +static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) { + unsigned NumElts = VT.getVectorNumElements(); + // Make sure the mask has the right size. + if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8)) + return false; + + // If Top + // Look for <0, N, 2, N+2, 4, N+4, ..>. + // This inserts Input2 into Input1 + // else if not Top + // Look for <0, N+1, 2, N+3, 4, N+5, ..> + // This inserts Input1 into Input2 + unsigned Offset = Top ? 0 : 1; + for (unsigned i = 0; i < NumElts; i+=2) { + if (M[i] >= 0 && M[i] != (int)i) + return false; + if (M[i+1] >= 0 && M[i+1] != (int)(NumElts + i + Offset)) + return false; + } + + return true; +} + // If N is an integer constant that can be moved into a register in one // instruction, return an SDValue of such a constant (will become a MOV // instruction). Otherwise return null. 
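// Illustrative sketch, not part of the commit: the two shuffle-mask shapes
// accepted by isVMOVNMask above, checked standalone for v8i16 (N == 8). Top
// keeps the even lanes of the first input and interleaves the even (Top) or
// odd (not Top) lanes of the second, which is the lane pattern the MVE
// VMOVNB/VMOVNT narrowing moves produce.
#include <cassert>
#include <vector>
static bool isVMOVNShape(const std::vector<int> &m, unsigned numElts, bool top) {
  if (m.size() != numElts) return false;
  unsigned offset = top ? 0 : 1;
  for (unsigned i = 0; i < numElts; i += 2) {
    if (m[i] >= 0 && m[i] != (int)i) return false;            // -1 = undef lane
    if (m[i + 1] >= 0 && m[i + 1] != (int)(numElts + i + offset)) return false;
  }
  return true;
}
int main() {
  assert(isVMOVNShape({0, 8, 2, 10, 4, 12, 6, 14}, 8, /*top=*/true));
  assert(isVMOVNShape({0, 9, 2, 11, 4, 13, 6, 15}, 8, /*top=*/false));
  assert(!isVMOVNShape({7, 6, 5, 4, 3, 2, 1, 0}, 8, /*top=*/true));
}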
@@ -6669,6 +6978,66 @@ static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, return SDValue(); } +static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!"); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned BoolMask; + unsigned BitsPerBool; + if (NumElts == 4) { + BitsPerBool = 4; + BoolMask = 0xf; + } else if (NumElts == 8) { + BitsPerBool = 2; + BoolMask = 0x3; + } else if (NumElts == 16) { + BitsPerBool = 1; + BoolMask = 0x1; + } else + return SDValue(); + + // If this is a single value copied into all lanes (a splat), we can just sign + // extend that single value + SDValue FirstOp = Op.getOperand(0); + if (!isa<ConstantSDNode>(FirstOp) && + std::all_of(std::next(Op->op_begin()), Op->op_end(), + [&FirstOp](SDUse &U) { + return U.get().isUndef() || U.get() == FirstOp; + })) { + SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp, + DAG.getValueType(MVT::i1)); + return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext); + } + + // First create base with bits set where known + unsigned Bits32 = 0; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (!isa<ConstantSDNode>(V) && !V.isUndef()) + continue; + bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue(); + if (BitSet) + Bits32 |= BoolMask << (i * BitsPerBool); + } + + // Add in unknown nodes + SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, + DAG.getConstant(Bits32, dl, MVT::i32)); + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (isa<ConstantSDNode>(V) || V.isUndef()) + continue; + Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V, + DAG.getConstant(i, dl, MVT::i32)); + } + + return Base; +} + // If this is a case we can't handle, return null and let the default // expansion code take care of it. SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, @@ -6677,6 +7046,9 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); EVT VT = Op.getValueType(); + if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) + return LowerBUILD_VECTOR_i1(Op, DAG, ST); + APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; @@ -6688,7 +7060,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) { // Check if an immediate VMOV works. EVT VmovVT; - SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), + SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, VT.is128BitVector(), VMOVModImm); @@ -6700,7 +7072,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // Try an immediate VMVN. uint64_t NegatedImm = (~SplatBits).getZExtValue(); - Val = isNEONModifiedImm( + Val = isVMOVModifiedImm( NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, VT.is128BitVector(), ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); @@ -7088,9 +7460,6 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, LaneMask[j] = ExtractBase + j; } - // Final check before we try to produce nonsense... - if (!isShuffleMaskLegal(Mask, ShuffleVT)) - return SDValue(); // We can't handle more than two sources. This should have already // been checked before this point. 
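// Illustrative sketch, not part of the commit: the constant-folding half of
// LowerBUILD_VECTOR_i1 above. An MVE predicate lives in the 16-bit VPR.P0
// register, so a v4i1/v8i1/v16i1 lane owns 4/2/1 of those bits; every lane
// with a known constant value is packed into one i32 immediate up front, and
// only the remaining lanes need INSERT_VECTOR_ELT nodes.
#include <cassert>
#include <cstdint>
#include <vector>
uint32_t packPredicateConstants(const std::vector<bool> &lanes) {
  unsigned numElts = (unsigned)lanes.size();   // 4, 8 or 16
  unsigned bitsPerBool = 16 / numElts;         // 4, 2 or 1
  uint32_t boolMask = (1u << bitsPerBool) - 1;
  uint32_t bits = 0;
  for (unsigned i = 0; i < numElts; ++i)
    if (lanes[i])
      bits |= boolMask << (i * bitsPerBool);
  return bits;
}
int main() {
  assert(packPredicateConstants({true, false, false, true}) == 0xf00f);  // v4i1
  assert(packPredicateConstants(std::vector<bool>(16, true)) == 0xffff); // v16i1
}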
@@ -7100,8 +7469,10 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, for (unsigned i = 0; i < Sources.size(); ++i) ShuffleOps[i] = Sources[i].ShuffleVec; - SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], - ShuffleOps[1], Mask); + SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0], + ShuffleOps[1], Mask, DAG); + if (!Shuffle) + return SDValue(); return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } @@ -7168,6 +7539,7 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { unsigned EltSize = VT.getScalarSizeInBits(); if (EltSize >= 32 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + ShuffleVectorInst::isIdentityMask(M) || isVREVMask(M, VT, 64) || isVREVMask(M, VT, 32) || isVREVMask(M, VT, 16)) @@ -7180,6 +7552,9 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)) return true; + else if (Subtarget->hasMVEIntegerOps() && + (isVMOVNMask(M, VT, 0) || isVMOVNMask(M, VT, 1))) + return true; else return false; } @@ -7282,6 +7657,94 @@ static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, DAG.getConstant(ExtractNum, DL, MVT::i32)); } +static EVT getVectorTyFromPredicateVector(EVT VT) { + switch (VT.getSimpleVT().SimpleTy) { + case MVT::v4i1: + return MVT::v4i32; + case MVT::v8i1: + return MVT::v8i16; + case MVT::v16i1: + return MVT::v16i8; + default: + llvm_unreachable("Unexpected vector predicate type"); + } +} + +static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, + SelectionDAG &DAG) { + // Converting from boolean predicates to integers involves creating a vector + // of all ones or all zeroes and selecting the lanes based upon the real + // predicate. + SDValue AllOnes = + DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32); + AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes); + + SDValue AllZeroes = + DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32); + AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes); + + // Get full vector type from predicate type + EVT NewVT = getVectorTyFromPredicateVector(VT); + + SDValue RecastV1; + // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast + // this to a v16i1. This cannot be done with an ordinary bitcast because the + // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node, + // since we know in hardware the sizes are really the same. + if (VT != MVT::v16i1) + RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred); + else + RecastV1 = Pred; + + // Select either all ones or zeroes depending upon the real predicate bits. + SDValue PredAsVector = + DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes); + + // Recast our new predicate-as-integer v16i8 vector into something + // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate. 
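// Illustrative sketch, not part of the commit: the lane-by-lane effect of
// PromoteMVEPredVector above. Each predicate lane selects between an
// all-ones and an all-zeroes pattern, yielding an ordinary integer vector
// that generic shuffle and extract lowering can operate on.
#include <cstdint>
void promote_v4i1(const bool pred[4], uint32_t out[4]) {
  for (int i = 0; i < 4; ++i)
    out[i] = pred[i] ? 0xffffffffu : 0u;   // VSELECT(pred, AllOnes, AllZeroes)
}
int main() {
  bool p[4] = {true, false, true, false};
  uint32_t v[4];
  promote_v4i1(p, v);
  return v[0] == 0xffffffffu && v[1] == 0u ? 0 : 1;
}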
+ return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
+}
+
+static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ EVT VT = Op.getValueType();
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+ ArrayRef<int> ShuffleMask = SVN->getMask();
+
+ assert(ST->hasMVEIntegerOps() &&
+ "No support for vector shuffle of boolean predicates");
+
+ SDValue V1 = Op.getOperand(0);
+ SDLoc dl(Op);
+ if (isReverseMask(ShuffleMask, VT)) {
+ SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
+ SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
+ SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
+ DAG.getConstant(16, dl, MVT::i32));
+ return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
+ }
+
+ // Until we can come up with optimised cases for every single vector
+ // shuffle in existence we have chosen the least painful strategy. This is
+ // to essentially promote the boolean predicate to an 8-bit integer, where
+ // each predicate represents a byte. Then we fall back on a normal integer
+ // vector shuffle and convert the result back into a predicate vector. In
+ // many cases the generated code might be even better than scalar code
+ // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
+ // fields in a register into 8 other arbitrary 2-bit fields!
+ SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG);
+ EVT NewVT = PredAsVector.getValueType();
+
+ // Do the shuffle!
+ SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector,
+ DAG.getUNDEF(NewVT), ShuffleMask);
+
+ // Now return the result of comparing the shuffled vector with zero,
+ // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
+ return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
+ DAG.getConstant(ARMCC::NE, dl, MVT::i32));
+}
+
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) {
SDValue V1 = Op.getOperand(0);
@@ -7289,6 +7752,10 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
SDLoc dl(Op);
EVT VT = Op.getValueType();
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+ unsigned EltSize = VT.getScalarSizeInBits();
+
+ if (ST->hasMVEIntegerOps() && EltSize == 1)
+ return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);

// Convert shuffles that are directly supported on NEON to target-specific
// DAG nodes, instead of keeping them as shuffles and matching them again
@@ -7298,7 +7765,6 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
// of the same type so that they get CSEd properly. 
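// Illustrative sketch, not part of the commit: the reverse-shuffle special
// case in LowerVECTOR_SHUFFLE_i1 above. Reversing a v16i1 predicate is a
// 16-bit bit reversal, performed as a 32-bit RBIT followed by a logical
// shift right by 16.
#include <cassert>
#include <cstdint>
uint32_t rbit32(uint32_t v) {               // stand-in for the RBIT instruction
  uint32_t r = 0;
  for (int i = 0; i < 32; ++i) r = (r << 1) | ((v >> i) & 1);
  return r;
}
uint16_t reversePredicate(uint16_t p) { return (uint16_t)(rbit32(p) >> 16); }
int main() {
  assert(reversePredicate(0x0001) == 0x8000);
  assert(reversePredicate(0x8000) == 0x0001);
}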
ArrayRef<int> ShuffleMask = SVN->getMask();
- unsigned EltSize = VT.getScalarSizeInBits();
if (EltSize <= 32) {
if (SVN->isSplat()) {
int Lane = SVN->getSplatIndex();
@@ -7364,6 +7830,14 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
.getValue(WhichResult);
}
}
+ if (ST->hasMVEIntegerOps()) {
+ if (isVMOVNMask(ShuffleMask, VT, 0))
+ return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
+ DAG.getConstant(0, dl, MVT::i32));
+ if (isVMOVNMask(ShuffleMask, VT, 1))
+ return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
+ DAG.getConstant(1, dl, MVT::i32));
+ }

// Also check for these shuffles through CONCAT_VECTORS: we canonicalize
// shuffles that produce a result larger than their operands with:
@@ -7468,8 +7942,29 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
return SDValue();
}

-SDValue ARMTargetLowering::
-LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ EVT VecVT = Op.getOperand(0).getValueType();
+ SDLoc dl(Op);
+
+ assert(ST->hasMVEIntegerOps() &&
+ "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
+
+ SDValue Conv =
+ DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
+ unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ unsigned LaneWidth =
+ getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
+ unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
+ Op.getOperand(1), DAG.getValueType(MVT::i1));
+ SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
+ DAG.getConstant(~Mask, dl, MVT::i32));
+ return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
+}
+
+SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
// INSERT_VECTOR_ELT is legal only for immediate indexes.
SDValue Lane = Op.getOperand(2);
if (!isa<ConstantSDNode>(Lane))
@@ -7477,6 +7972,11 @@ LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {

SDValue Elt = Op.getOperand(1);
EVT EltVT = Elt.getValueType();
+
+ if (Subtarget->hasMVEIntegerOps() &&
+ Op.getValueType().getScalarSizeInBits() == 1)
+ return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
+
if (getTypeAction(*DAG.getContext(), EltVT) ==
TargetLowering::TypePromoteFloat) {
// INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
@@ -7505,13 +8005,37 @@ LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
return Op;
}

-static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ EVT VecVT = Op.getOperand(0).getValueType();
+ SDLoc dl(Op);
+
+ assert(ST->hasMVEIntegerOps() &&
+ "LowerEXTRACT_VECTOR_ELT_i1 called without MVE!");
+
+ SDValue Conv =
+ DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
+ unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ unsigned LaneWidth =
+ getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
+ SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
+ DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
+ return Shift;
+}
+
+static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
// EXTRACT_VECTOR_ELT is legal only for immediate indexes. 
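// Illustrative sketch, not part of the commit: the bit arithmetic behind the
// i1 element accessors above. With LaneWidth predicate bits per lane, extract
// is a plain shift right and insert is a masked bitfield insert (the
// ARMISD::BFI node); shown here for v4i1, where LaneWidth == 4.
#include <cassert>
#include <cstdint>
uint32_t extractLane(uint32_t pred, unsigned lane, unsigned laneWidth) {
  return pred >> (lane * laneWidth);           // caller inspects the low bit(s)
}
uint32_t insertLane(uint32_t pred, unsigned lane, unsigned laneWidth, bool v) {
  uint32_t mask = ((1u << laneWidth) - 1) << (lane * laneWidth);
  uint32_t ext = v ? ~0u : 0u;                 // the sign-extended i1 element
  return (pred & ~mask) | (ext & mask);        // BFI keeps ~mask, inserts mask
}
int main() {
  uint32_t p = insertLane(0, 2, 4, true);      // set lane 2 of a v4i1
  assert(p == 0x0f00 && (extractLane(p, 2, 4) & 1) == 1);
}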
SDValue Lane = Op.getOperand(1); if (!isa<ConstantSDNode>(Lane)) return SDValue(); SDValue Vec = Op.getOperand(0); + EVT VT = Vec.getValueType(); + + if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) + return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST); + if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) { SDLoc dl(Op); return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); @@ -7520,7 +8044,64 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { return Op; } -static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + EVT Op1VT = V1.getValueType(); + EVT Op2VT = V2.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + + assert(Op1VT == Op2VT && "Operand types don't match!"); + assert(VT.getScalarSizeInBits() == 1 && + "Unexpected custom CONCAT_VECTORS lowering"); + assert(ST->hasMVEIntegerOps() && + "CONCAT_VECTORS lowering only supported for MVE"); + + SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); + SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG); + + // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets + // promoted to v8i16, etc. + + MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); + + // Extract the vector elements from Op1 and Op2 one by one and truncate them + // to be the right size for the destination. For example, if Op1 is v4i1 then + // the promoted vector is v4i32. The result of concatenation gives a v8i1, + // which when promoted is v8i16. That means each i32 element from Op1 needs + // truncating to i16 and inserting in the result. + EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); + SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); + auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { + EVT NewVT = NewV.getValueType(); + EVT ConcatVT = ConVec.getValueType(); + for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, + DAG.getIntPtrConstant(i, dl)); + ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, + DAG.getConstant(j, dl, MVT::i32)); + } + return ConVec; + }; + unsigned j = 0; + ConVec = ExtractInto(NewV1, ConVec, j); + ConVec = ExtractInto(NewV2, ConVec, j); + + // Now return the result of comparing the subvector with zero, + // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. + return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); +} + +static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = Op->getValueType(0); + if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) + return LowerCONCAT_VECTORS_i1(Op, DAG, ST); + // The only time a CONCAT_VECTORS operation can have legal types is when // two 64-bit vectors are concatenated to a 128-bit vector.
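A sketch of LowerCONCAT_VECTORS_i1's element-by-element strategy (illustrative only): promote both predicates, copy the promoted elements side by side into one wider vector, and rebuild the combined predicate with a single compare-against-zero.

    #include <vector>

    // Concatenate two boolean predicates the way the lowering above does:
    // promote, extract/insert each element into the wider vector, VCMPZ(NE).
    std::vector<bool> concatPreds(const std::vector<bool> &A,
                                  const std::vector<bool> &B) {
      std::vector<int> ConVec;
      for (bool b : A) ConVec.push_back(b ? -1 : 0); // elements of Op1 first
      for (bool b : B) ConVec.push_back(b ? -1 : 0); // then elements of Op2
      std::vector<bool> Out;
      for (int v : ConVec) Out.push_back(v != 0);    // compare-not-equal-zero
      return Out;
    }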
assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && @@ -7540,6 +8121,43 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); } +static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + EVT Op1VT = V1.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue(); + + assert(VT.getScalarSizeInBits() == 1 && + "Unexpected custom EXTRACT_SUBVECTOR lowering"); + assert(ST->hasMVEIntegerOps() && + "EXTRACT_SUBVECTOR lowering only supported for MVE"); + + SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); + + // We now have Op1 promoted to a vector of integers, where v8i1 gets + // promoted to v8i16, etc. + + MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); + + EVT SubVT = MVT::getVectorVT(ElType, NumElts); + SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); + for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1, + DAG.getIntPtrConstant(i, dl)); + SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, + DAG.getConstant(j, dl, MVT::i32)); + } + + // Now return the result of comparing the subvector with zero, + // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. + return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); +} + /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each /// element has been zero/sign-extended, depending on the isSigned parameter, /// from an integer type half its size. @@ -7897,7 +8515,8 @@ static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, return N0; } -static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { EVT VT = Op.getValueType(); assert((VT == MVT::v4i16 || VT == MVT::v8i8) && "unexpected type for custom-lowering ISD::SDIV"); @@ -7924,7 +8543,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); - N0 = LowerCONCAT_VECTORS(N0, DAG); + N0 = LowerCONCAT_VECTORS(N0, DAG, ST); N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); return N0; @@ -7932,7 +8551,8 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { return LowerSDIV_v4i16(N0, N1, dl, DAG); } -static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { // TODO: Should this propagate fast-math-flags? 
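Both the CONCAT_VECTORS and EXTRACT_SUBVECTOR paths above rely on the MVE predicate layout: VPR.P0 is 16 bits wide, so a v16i1 gets 1 bit per lane, a v8i1 2 bits, and a v4i1 4 bits, which is exactly the LaneWidth derived from getVectorTyFromPredicateVector. A compile-time sanity check of that arithmetic (an aside, not part of the patch):

    // Bits per predicate lane: the 16-bit predicate divided by the lane count.
    constexpr unsigned predLaneWidth(unsigned NumLanes) { return 16 / NumLanes; }

    static_assert(predLaneWidth(4) == 4, "v4i1 spreads each lane over 4 bits");
    static_assert(predLaneWidth(8) == 2, "v8i1 spreads each lane over 2 bits");
    static_assert(predLaneWidth(16) == 1, "v16i1 uses 1 bit per lane");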
EVT VT = Op.getValueType(); assert((VT == MVT::v4i16 || VT == MVT::v8i8) && @@ -7960,7 +8580,7 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); - N0 = LowerCONCAT_VECTORS(N0, DAG); + N0 = LowerCONCAT_VECTORS(N0, DAG, ST); N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, @@ -8255,6 +8875,96 @@ void ARMTargetLowering::ExpandDIV_Windows( Results.push_back(Upper); } +static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) { + LoadSDNode *LD = cast<LoadSDNode>(Op.getNode()); + EVT MemVT = LD->getMemoryVT(); + assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && + "Expected a predicate type!"); + assert(MemVT == Op.getValueType()); + assert(LD->getExtensionType() == ISD::NON_EXTLOAD && + "Expected a non-extending load"); + assert(LD->isUnindexed() && "Expected an unindexed load"); + + // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16-bit + // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We + // need to make sure that 8/4 bits are actually loaded into the correct + // place, which means loading the value and then shuffling the values into + // the bottom bits of the predicate. + // Equally, VLDR for a v16i1 will actually load 32 bits (so will be incorrect + // for BE). + + SDLoc dl(Op); + SDValue Load = DAG.getExtLoad( + ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(), + EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), + LD->getMemOperand()); + SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Load); + if (MemVT != MVT::v16i1) + Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred, + DAG.getConstant(0, dl, MVT::i32)); + return DAG.getMergeValues({Pred, Load.getValue(1)}, dl); +} + +static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { + StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); + EVT MemVT = ST->getMemoryVT(); + assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && + "Expected a predicate type!"); + assert(MemVT == ST->getValue().getValueType()); + assert(!ST->isTruncatingStore() && "Expected a non-truncating store"); + assert(ST->isUnindexed() && "Expected an unindexed store"); + + // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits + // unset and a scalar store.
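In memory, by contrast, these types are dense: one bit per element, so a v8i1 occupies a single byte. That is why the load above any-extends an 8/4-bit scalar rather than using VLDR directly, and why the store (whose body continues below) truncates back down. A byte-level model for v8i1, with zero-extension shown where the DAG uses an any-extending load:

    #include <cstdint>
    #include <cstring>

    // Model of LowerPredicateLoad for v8i1: the 8 lanes occupy one byte in
    // memory; load it, widen it, and treat bits 0-7 as the predicate lanes.
    uint32_t loadPred8(const uint8_t *Mem) {
      uint8_t Byte;
      std::memcpy(&Byte, Mem, 1); // the scalar EXTLOAD
      return Byte;                // lanes live in the bottom 8 bits
    }

    // Model of LowerPredicateStore for v8i1: keep only the predicate's own
    // 8 bits (the v16i1 buildvector's top lanes are undef and never stored).
    void storePred8(uint8_t *Mem, uint32_t Pred) {
      uint8_t Byte = Pred & 0xff; // the truncating scalar store
      std::memcpy(Mem, &Byte, 1);
    }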
+ SDLoc dl(Op); + SDValue Build = ST->getValue(); + if (MemVT != MVT::v16i1) { + SmallVector<SDValue, 16> Ops; + for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) + Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build, + DAG.getConstant(I, dl, MVT::i32))); + for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++) + Ops.push_back(DAG.getUNDEF(MVT::i32)); + Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops); + } + SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build); + return DAG.getTruncStore( + ST->getChain(), dl, GRP, ST->getBasePtr(), + EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), + ST->getMemOperand()); +} + +static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { + MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode()); + MVT VT = Op.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDValue PassThru = N->getPassThru(); + SDLoc dl(Op); + + auto IsZero = [](SDValue PassThru) { + return (ISD::isBuildVectorAllZeros(PassThru.getNode()) || + (PassThru->getOpcode() == ARMISD::VMOVIMM && + isNullConstant(PassThru->getOperand(0)))); + }; + + if (IsZero(PassThru)) + return Op; + + // MVE Masked loads use zero as the passthru value. Here we convert undef to + // zero too, and other values are lowered to a select. + SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(0, dl, MVT::i32)); + SDValue NewLoad = DAG.getMaskedLoad( + VT, dl, N->getChain(), N->getBasePtr(), Mask, ZeroVec, N->getMemoryVT(), + N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad()); + SDValue Combo = NewLoad; + if (!PassThru.isUndef() && + (PassThru.getOpcode() != ISD::BITCAST || + !IsZero(PassThru->getOperand(0)))) + Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); + return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl); +} + static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering())) // Acquire/Release load/store is not legal for targets without a dmb or @@ -8273,12 +8983,12 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N, // Under Power Management extensions, the cycle-count is: // mrc p15, #0, <Rt>, c9, c13, #0 SDValue Ops[] = { N->getOperand(0), // Chain - DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), - DAG.getConstant(15, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(9, DL, MVT::i32), - DAG.getConstant(13, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32) + DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), + DAG.getTargetConstant(15, DL, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32), + DAG.getTargetConstant(9, DL, MVT::i32), + DAG.getTargetConstant(13, DL, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32) }; SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, @@ -8412,6 +9122,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); + case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, Subtarget); case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget); @@ -8426,24 +9137,25 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: return 
LowerCTTZ(Op.getNode(), DAG, Subtarget); case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); - case ISD::SETCC: return LowerVSETCC(Op, DAG); + case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget); case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget); + case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); - case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); - case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::SDIV: if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) return LowerDIV_Windows(Op, DAG, /* Signed */ true); - return LowerSDIV(Op, DAG); + return LowerSDIV(Op, DAG, Subtarget); case ISD::UDIV: if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) return LowerDIV_Windows(Op, DAG, /* Signed */ false); - return LowerUDIV(Op, DAG); + return LowerUDIV(Op, DAG, Subtarget); case ISD::ADDCARRY: case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); case ISD::SADDO: @@ -8452,6 +9164,15 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UADDO: case ISD::USUBO: return LowerUnsignedALUO(Op, DAG); + case ISD::SADDSAT: + case ISD::SSUBSAT: + return LowerSADDSUBSAT(Op, DAG, Subtarget); + case ISD::LOAD: + return LowerPredicateLoad(Op, DAG); + case ISD::STORE: + return LowerPredicateStore(Op, DAG); + case ISD::MLOAD: + return LowerMLOAD(Op, DAG); case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); @@ -8530,6 +9251,10 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Res.getValue(0)); Results.push_back(Res.getValue(1)); return; + case ISD::SADDSAT: + case ISD::SSUBSAT: + Res = LowerSADDSUBSAT(SDValue(N, 0), DAG, Subtarget); + break; case ISD::READCYCLECOUNTER: ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); return; @@ -8600,19 +9325,19 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, // orr r5, r5, #1 // add r5, pc // str r5, [$jbuf, #+4] ; &jbuf[1] - unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) .addConstantPoolIndex(CPI) .addMemOperand(CPMMO) .add(predOps(ARMCC::AL)); // Set the low bit because of thumb mode. 
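For the new ISD::MLOAD case routed to LowerMLOAD above: MVE's predicated loads write zero to inactive lanes, so a zero (or undef) passthru comes for free, and anything else costs a select after the load. A sketch of those semantics with vectors modelled as arrays (illustrative, not the DAG code):

    #include <cstddef>
    #include <vector>

    // Masked load: hardware zeroes inactive lanes; a non-zero passthru is
    // then blended back in, which is the extra VSELECT the lowering emits.
    std::vector<int> maskedLoad(const std::vector<int> &Mem,
                                const std::vector<bool> &Mask,
                                const std::vector<int> &PassThru) {
      std::vector<int> Loaded(Mem.size());
      for (std::size_t i = 0; i < Mem.size(); ++i)
        Loaded[i] = Mask[i] ? Mem[i] : 0;           // what the VLDR produces
      std::vector<int> Out(Mem.size());
      for (std::size_t i = 0; i < Mem.size(); ++i)
        Out[i] = Mask[i] ? Loaded[i] : PassThru[i]; // the VSELECT
      return Out;
    }

When PassThru is all zeroes the two loops agree, which is why the IsZero check in LowerMLOAD returns the operation unchanged.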
- unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + Register NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) .addReg(NewVReg1, RegState::Kill) .addImm(0x01) .add(predOps(ARMCC::AL)) .add(condCodeOp()); - unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + Register NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) .addReg(NewVReg2, RegState::Kill) .addImm(PCLabelId); @@ -8630,28 +9355,28 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, // orrs r1, r2 // add r2, $jbuf, #+4 ; &jbuf[1] // str r1, [r2] - unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) .addConstantPoolIndex(CPI) .addMemOperand(CPMMO) .add(predOps(ARMCC::AL)); - unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + Register NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) .addReg(NewVReg1, RegState::Kill) .addImm(PCLabelId); // Set the low bit because of thumb mode. - unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + Register NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) .addReg(ARM::CPSR, RegState::Define) .addImm(1) .add(predOps(ARMCC::AL)); - unsigned NewVReg4 = MRI->createVirtualRegister(TRC); + Register NewVReg4 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg2, RegState::Kill) .addReg(NewVReg3, RegState::Kill) .add(predOps(ARMCC::AL)); - unsigned NewVReg5 = MRI->createVirtualRegister(TRC); + Register NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) .addFrameIndex(FI) .addImm(36); // &jbuf[1] :: pc @@ -8666,13 +9391,13 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, // ldr r1, LCPI1_1 // add r1, pc, r1 // str r1, [$jbuf, #+4] ; &jbuf[1] - unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) .addConstantPoolIndex(CPI) .addImm(0) .addMemOperand(CPMMO) .add(predOps(ARMCC::AL)); - unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + Register NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) .addReg(NewVReg1, RegState::Kill) .addImm(PCLabelId) @@ -8794,7 +9519,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, bool IsPositionIndependent = isPositionIndependent(); unsigned NumLPads = LPadList.size(); if (Subtarget->isThumb2()) { - unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) .addFrameIndex(FI) .addImm(4) @@ -8807,7 +9532,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addImm(LPadList.size()) .add(predOps(ARMCC::AL)); } else { - unsigned VReg1 = MRI->createVirtualRegister(TRC); + Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) .addImm(NumLPads & 0xFFFF) .add(predOps(ARMCC::AL)); @@ -8832,12 +9557,12 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addImm(ARMCC::HI) .addReg(ARM::CPSR); - unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + Register NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, 
TII->get(ARM::t2LEApcrelJT), NewVReg3) .addJumpTableIndex(MJTI) .add(predOps(ARMCC::AL)); - unsigned NewVReg4 = MRI->createVirtualRegister(TRC); + Register NewVReg4 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) .addReg(NewVReg3, RegState::Kill) .addReg(NewVReg1) @@ -8850,7 +9575,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addReg(NewVReg1) .addJumpTableIndex(MJTI); } else if (Subtarget->isThumb()) { - unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) .addFrameIndex(FI) .addImm(1) @@ -8873,7 +9598,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); - unsigned VReg1 = MRI->createVirtualRegister(TRC); + Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) .addReg(VReg1, RegState::Define) .addConstantPoolIndex(Idx) @@ -8889,19 +9614,19 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addImm(ARMCC::HI) .addReg(ARM::CPSR); - unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + Register NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg1) .addImm(2) .add(predOps(ARMCC::AL)); - unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + Register NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) .addJumpTableIndex(MJTI) .add(predOps(ARMCC::AL)); - unsigned NewVReg4 = MRI->createVirtualRegister(TRC); + Register NewVReg4 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg2, RegState::Kill) @@ -8911,7 +9636,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); - unsigned NewVReg5 = MRI->createVirtualRegister(TRC); + Register NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) .addReg(NewVReg4, RegState::Kill) .addImm(0) @@ -8932,7 +9657,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addReg(NewVReg6, RegState::Kill) .addJumpTableIndex(MJTI); } else { - unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) .addFrameIndex(FI) .addImm(4) @@ -8945,7 +9670,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addImm(NumLPads) .add(predOps(ARMCC::AL)); } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { - unsigned VReg1 = MRI->createVirtualRegister(TRC); + Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) .addImm(NumLPads & 0xFFFF) .add(predOps(ARMCC::AL)); @@ -8974,7 +9699,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); - unsigned VReg1 = MRI->createVirtualRegister(TRC); + Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) .addReg(VReg1, RegState::Define) .addConstantPoolIndex(Idx) @@ -8991,20 
+9716,20 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addImm(ARMCC::HI) .addReg(ARM::CPSR); - unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + Register NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) .addReg(NewVReg1) .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) .add(predOps(ARMCC::AL)) .add(condCodeOp()); - unsigned NewVReg4 = MRI->createVirtualRegister(TRC); + Register NewVReg4 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) .addJumpTableIndex(MJTI) .add(predOps(ARMCC::AL)); MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); - unsigned NewVReg5 = MRI->createVirtualRegister(TRC); + Register NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) .addReg(NewVReg3, RegState::Kill) .addReg(NewVReg4) @@ -9239,8 +9964,8 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = ++BB->getIterator(); - unsigned dest = MI.getOperand(0).getReg(); - unsigned src = MI.getOperand(1).getReg(); + Register dest = MI.getOperand(0).getReg(); + Register src = MI.getOperand(1).getReg(); unsigned SizeVal = MI.getOperand(2).getImm(); unsigned Align = MI.getOperand(3).getImm(); DebugLoc dl = MI.getDebugLoc(); @@ -9291,9 +10016,9 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, unsigned srcIn = src; unsigned destIn = dest; for (unsigned i = 0; i < LoopSize; i+=UnitSize) { - unsigned srcOut = MRI.createVirtualRegister(TRC); - unsigned destOut = MRI.createVirtualRegister(TRC); - unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); + Register srcOut = MRI.createVirtualRegister(TRC); + Register destOut = MRI.createVirtualRegister(TRC); + Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, IsThumb1, IsThumb2); emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, @@ -9306,9 +10031,9 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, // [scratch, srcOut] = LDRB_POST(srcIn, 1) // [destOut] = STRB_POST(scratch, destIn, 1) for (unsigned i = 0; i < BytesLeft; i++) { - unsigned srcOut = MRI.createVirtualRegister(TRC); - unsigned destOut = MRI.createVirtualRegister(TRC); - unsigned scratch = MRI.createVirtualRegister(TRC); + Register srcOut = MRI.createVirtualRegister(TRC); + Register destOut = MRI.createVirtualRegister(TRC); + Register scratch = MRI.createVirtualRegister(TRC); emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, IsThumb1, IsThumb2); emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, @@ -9351,7 +10076,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, exitMBB->transferSuccessorsAndUpdatePHIs(BB); // Load an immediate to varEnd. 
- unsigned varEnd = MRI.createVirtualRegister(TRC); + Register varEnd = MRI.createVirtualRegister(TRC); if (Subtarget->useMovt()) { unsigned Vtmp = varEnd; if ((LoopSize & 0xFFFF0000) != 0) @@ -9401,12 +10126,12 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, // destPhi = PHI(destLoop, dst) MachineBasicBlock *entryBB = BB; BB = loopMBB; - unsigned varLoop = MRI.createVirtualRegister(TRC); - unsigned varPhi = MRI.createVirtualRegister(TRC); - unsigned srcLoop = MRI.createVirtualRegister(TRC); - unsigned srcPhi = MRI.createVirtualRegister(TRC); - unsigned destLoop = MRI.createVirtualRegister(TRC); - unsigned destPhi = MRI.createVirtualRegister(TRC); + Register varLoop = MRI.createVirtualRegister(TRC); + Register varPhi = MRI.createVirtualRegister(TRC); + Register srcLoop = MRI.createVirtualRegister(TRC); + Register srcPhi = MRI.createVirtualRegister(TRC); + Register destLoop = MRI.createVirtualRegister(TRC); + Register destPhi = MRI.createVirtualRegister(TRC); BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi) .addReg(varLoop).addMBB(loopMBB) @@ -9420,7 +10145,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) // [destLoop] = STR_POST(scratch, destPhi, UnitSiz) - unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); + Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop, IsThumb1, IsThumb2); emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop, @@ -9461,9 +10186,9 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, unsigned srcIn = srcLoop; unsigned destIn = destLoop; for (unsigned i = 0; i < BytesLeft; i++) { - unsigned srcOut = MRI.createVirtualRegister(TRC); - unsigned destOut = MRI.createVirtualRegister(TRC); - unsigned scratch = MRI.createVirtualRegister(TRC); + Register srcOut = MRI.createVirtualRegister(TRC); + Register destOut = MRI.createVirtualRegister(TRC); + Register scratch = MRI.createVirtualRegister(TRC); emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut, IsThumb1, IsThumb2); emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut, @@ -9523,7 +10248,7 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI, break; case CodeModel::Large: { MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) .addExternalSymbol("__chkstk"); @@ -9771,8 +10496,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // equality. bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; - unsigned LHS1 = MI.getOperand(1).getReg(); - unsigned LHS2 = MI.getOperand(2).getReg(); + Register LHS1 = MI.getOperand(1).getReg(); + Register LHS2 = MI.getOperand(2).getReg(); if (RHSisZero) { BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) .addReg(LHS1) @@ -9782,8 +10507,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(LHS2).addImm(0) .addImm(ARMCC::EQ).addReg(ARM::CPSR); } else { - unsigned RHS1 = MI.getOperand(3).getReg(); - unsigned RHS2 = MI.getOperand(4).getReg(); + Register RHS1 = MI.getOperand(3).getReg(); + Register RHS2 = MI.getOperand(4).getReg(); BuildMI(BB, dl, TII->get(isThumb2 ? 
ARM::t2CMPrr : ARM::CMPrr)) .addReg(LHS1) .addReg(RHS1) @@ -9844,15 +10569,15 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, Fn->insert(BBI, RSBBB); Fn->insert(BBI, SinkBB); - unsigned int ABSSrcReg = MI.getOperand(1).getReg(); - unsigned int ABSDstReg = MI.getOperand(0).getReg(); + Register ABSSrcReg = MI.getOperand(1).getReg(); + Register ABSDstReg = MI.getOperand(0).getReg(); bool ABSSrcKIll = MI.getOperand(1).isKill(); bool isThumb2 = Subtarget->isThumb2(); MachineRegisterInfo &MRI = Fn->getRegInfo(); // In Thumb mode S must not be specified if source register is the SP or // PC and if destination register is the SP, so restrict register class - unsigned NewRsbDstReg = - MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); + Register NewRsbDstReg = MRI.createVirtualRegister( + isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); // Transfer the remainder of BB and its successor edges to sinkMBB. SinkBB->splice(SinkBB->begin(), BB, @@ -9931,7 +10656,7 @@ static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, // The MEMCPY both defines and kills the scratch registers. for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) { - unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass + Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass : &ARM::GPRRegClass); MIB.addReg(TmpReg, RegState::Define|RegState::Dead); } @@ -10369,10 +11094,7 @@ static SDValue findMUL_LOHI(SDValue V) { static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { - if (Subtarget->isThumb()) { - if (!Subtarget->hasDSP()) - return SDValue(); - } else if (!Subtarget->hasV5TEOps()) + if (!Subtarget->hasBaseDSP()) return SDValue(); // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and @@ -11253,7 +11975,7 @@ static SDValue PerformANDCombine(SDNode *N, BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { EVT VbicVT; - SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(), + SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VbicVT, VT.is128BitVector(), OtherModImm); @@ -11469,6 +12191,77 @@ static SDValue PerformORCombineToBFI(SDNode *N, return SDValue(); } +static bool isValidMVECond(unsigned CC, bool IsFloat) { + switch (CC) { + case ARMCC::EQ: + case ARMCC::NE: + case ARMCC::LE: + case ARMCC::GT: + case ARMCC::GE: + case ARMCC::LT: + return true; + case ARMCC::HS: + case ARMCC::HI: + return !IsFloat; + default: + return false; + }; +} + +static SDValue PerformORCombine_i1(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain + // together with predicates + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + ARMCC::CondCodes CondCode0 = ARMCC::AL; + ARMCC::CondCodes CondCode1 = ARMCC::AL; + if (N0->getOpcode() == ARMISD::VCMP) + CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(2)) + ->getZExtValue(); + else if (N0->getOpcode() == ARMISD::VCMPZ) + CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(1)) + ->getZExtValue(); + if (N1->getOpcode() == ARMISD::VCMP) + CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(2)) + ->getZExtValue(); + else if (N1->getOpcode() == ARMISD::VCMPZ) + CondCode1 = 
(ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(1)) + ->getZExtValue(); + + if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL) + return SDValue(); + + unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0); + unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1); + + if (!isValidMVECond(Opposite0, + N0->getOperand(0)->getValueType(0).isFloatingPoint()) || + !isValidMVECond(Opposite1, + N1->getOperand(0)->getValueType(0).isFloatingPoint())) + return SDValue(); + + SmallVector<SDValue, 4> Ops0; + Ops0.push_back(N0->getOperand(0)); + if (N0->getOpcode() == ARMISD::VCMP) + Ops0.push_back(N0->getOperand(1)); + Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32)); + SmallVector<SDValue, 4> Ops1; + Ops1.push_back(N1->getOperand(0)); + if (N1->getOpcode() == ARMISD::VCMP) + Ops1.push_back(N1->getOperand(1)); + Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32)); + + SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0); + SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1); + SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1); + return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And, + DCI.DAG.getAllOnesConstant(SDLoc(N), VT)); +} + /// PerformORCombine - Target-specific dag combine xforms for ISD::OR static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, @@ -11489,7 +12282,7 @@ static SDValue PerformORCombine(SDNode *N, BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { EVT VorrVT; - SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), + SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VorrVT, VT.is128BitVector(), OtherModImm); @@ -11553,6 +12346,10 @@ static SDValue PerformORCombine(SDNode *N, } } + if (Subtarget->hasMVEIntegerOps() && + (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)) + return PerformORCombine_i1(N, DCI, Subtarget); + // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when // reasonable. if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { @@ -11921,6 +12718,24 @@ PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return Vec; } +static SDValue +PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + SDValue Op = N->getOperand(0); + SDLoc dl(N); + + // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x) + if (Op->getOpcode() == ARMISD::PREDICATE_CAST) { + // If the valuetypes are the same, we can remove the cast entirely. + if (Op->getOperand(0).getValueType() == VT) + return Op->getOperand(0); + return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, + Op->getOperand(0).getValueType(), Op->getOperand(0)); + } + + return SDValue(); +} + /// PerformInsertEltCombine - Target-specific dag combine xforms for /// ISD::INSERT_VECTOR_ELT. static SDValue PerformInsertEltCombine(SDNode *N, @@ -12332,7 +13147,7 @@ static SDValue PerformVDUPLANECombine(SDNode *N, // The canonical VMOV for a zero vector uses a 32-bit element size. 
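PerformORCombine_i1 above is De Morgan's law with the inversions absorbed into the compares: or(a, b) becomes xor(and(not a, not b), all-ones), where each "not" is obtained by flipping a VCMP/VCMPZ condition code (hence the isValidMVECond check on the opposite conditions). A quick check of the identity on masks:

    #include <cassert>
    #include <cstdint>

    int main() {
      // a | b == ~(~a & ~b) for every 4-bit predicate mask; the combine gets
      // ~a and ~b for free by inverting the compares' condition codes.
      for (uint32_t a = 0; a < 16; ++a)
        for (uint32_t b = 0; b < 16; ++b)
          assert(((a | b) & 0xfu) == (~(~a & ~b) & 0xfu));
      return 0;
    }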
unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); unsigned EltBits; - if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0) + if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0) EltSize = 8; EVT VT = N->getValueType(0); if (EltSize > VT.getScalarSizeInBits()) @@ -12382,95 +13197,163 @@ static SDValue PerformLOADCombine(SDNode *N, return SDValue(); } -/// PerformSTORECombine - Target-specific dag combine xforms for -/// ISD::STORE. -static SDValue PerformSTORECombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - StoreSDNode *St = cast<StoreSDNode>(N); - if (St->isVolatile()) - return SDValue(); - - // Optimize trunc store (of multiple scalars) to shuffle and store. First, - // pack all of the elements in one place. Next, store to memory in fewer - // chunks. +// Optimize trunc store (of multiple scalars) to shuffle and store. First, +// pack all of the elements in one place. Next, store to memory in fewer +// chunks. +static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, + SelectionDAG &DAG) { SDValue StVal = St->getValue(); EVT VT = StVal.getValueType(); - if (St->isTruncatingStore() && VT.isVector()) { - SelectionDAG &DAG = DCI.DAG; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT StVT = St->getMemoryVT(); - unsigned NumElems = VT.getVectorNumElements(); - assert(StVT != VT && "Cannot truncate to the same type"); - unsigned FromEltSz = VT.getScalarSizeInBits(); - unsigned ToEltSz = StVT.getScalarSizeInBits(); + if (!St->isTruncatingStore() || !VT.isVector()) + return SDValue(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT StVT = St->getMemoryVT(); + unsigned NumElems = VT.getVectorNumElements(); + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromEltSz = VT.getScalarSizeInBits(); + unsigned ToEltSz = StVT.getScalarSizeInBits(); - // From, To sizes and ElemCount must be pow of two - if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue(); + // From, To sizes and ElemCount must be pow of two + if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) + return SDValue(); - // We are going to use the original vector elt for storing. - // Accumulated smaller vector elements must be a multiple of the store size. - if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue(); + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + if (0 != (NumElems * FromEltSz) % ToEltSz) + return SDValue(); - unsigned SizeRatio = FromEltSz / ToEltSz; - assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); + unsigned SizeRatio = FromEltSz / ToEltSz; + assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); - // Create a type on which we perform the shuffle. - EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), - NumElems*SizeRatio); - assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + // Create a type on which we perform the shuffle. + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), + NumElems * SizeRatio); + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - SDLoc DL(St); - SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); - SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i < NumElems; ++i) - ShuffleVec[i] = DAG.getDataLayout().isBigEndian() - ? 
(i + 1) * SizeRatio - 1 - : i * SizeRatio; + SDLoc DL(St); + SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); + SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i < NumElems; ++i) + ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1 + : i * SizeRatio; - // Can't shuffle using an illegal type. - if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); + // Can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) + return SDValue(); - SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec, - DAG.getUNDEF(WideVec.getValueType()), - ShuffleVec); - // At this point all of the data is stored at the bottom of the - // register. We now need to save it to mem. + SDValue Shuff = DAG.getVectorShuffle( + WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec); + // At this point all of the data is stored at the bottom of the + // register. We now need to save it to mem. - // Find the largest store unit - MVT StoreType = MVT::i8; - for (MVT Tp : MVT::integer_valuetypes()) { - if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) - StoreType = Tp; - } - // Didn't find a legal store type. - if (!TLI.isTypeLegal(StoreType)) - return SDValue(); + // Find the largest store unit + MVT StoreType = MVT::i8; + for (MVT Tp : MVT::integer_valuetypes()) { + if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) + StoreType = Tp; + } + // Didn't find a legal store type. + if (!TLI.isTypeLegal(StoreType)) + return SDValue(); - // Bitcast the original vector into a vector of store-size units - EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), - StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); - assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); - SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); - SmallVector<SDValue, 8> Chains; - SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, - TLI.getPointerTy(DAG.getDataLayout())); - SDValue BasePtr = St->getBasePtr(); + // Bitcast the original vector into a vector of store-size units + EVT StoreVecVT = + EVT::getVectorVT(*DAG.getContext(), StoreType, + VT.getSizeInBits() / EVT(StoreType).getSizeInBits()); + assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); + SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); + SmallVector<SDValue, 8> Chains; + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, + TLI.getPointerTy(DAG.getDataLayout())); + SDValue BasePtr = St->getBasePtr(); - // Perform one or more big stores into memory. - unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits(); - for (unsigned I = 0; I < E; I++) { - SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, - StoreType, ShuffWide, - DAG.getIntPtrConstant(I, DL)); - SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, - St->getPointerInfo(), St->getAlignment(), - St->getMemOperand()->getFlags()); - BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, - Increment); - Chains.push_back(Ch); - } - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + // Perform one or more big stores into memory. 
+ unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits(); + for (unsigned I = 0; I < E; I++) { + SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType, + ShuffWide, DAG.getIntPtrConstant(I, DL)); + SDValue Ch = + DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(), + St->getAlignment(), St->getMemOperand()->getFlags()); + BasePtr = + DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment); + Chains.push_back(Ch); + } + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); +} + +// Try taking a single vector store from a truncate (which would otherwise turn +// into an expensive buildvector) and splitting it into a series of narrowing +// stores. +static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, + SelectionDAG &DAG) { + if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) + return SDValue(); + SDValue Trunc = St->getValue(); + if (Trunc->getOpcode() != ISD::TRUNCATE) + return SDValue(); + EVT FromVT = Trunc->getOperand(0).getValueType(); + EVT ToVT = Trunc.getValueType(); + if (!ToVT.isVector()) + return SDValue(); + assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); + EVT ToEltVT = ToVT.getVectorElementType(); + EVT FromEltVT = FromVT.getVectorElementType(); + + unsigned NumElements = 0; + if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8)) + NumElements = 4; + if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8) + NumElements = 8; + if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements || + FromVT.getVectorNumElements() % NumElements != 0) + return SDValue(); + + SDLoc DL(St); + // Details about the old store + SDValue Ch = St->getChain(); + SDValue BasePtr = St->getBasePtr(); + unsigned Alignment = St->getOriginalAlignment(); + MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); + AAMDNodes AAInfo = St->getAAInfo(); + + EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements); + EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements); + + SmallVector<SDValue, 4> Stores; + for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { + unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8; + SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset); + + SDValue Extract = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0), + DAG.getConstant(i * NumElements, DL, MVT::i32)); + SDValue Store = DAG.getTruncStore( + Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset), + NewToVT, Alignment, MMOFlags, AAInfo); + Stores.push_back(Store); } return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); +} + +/// PerformSTORECombine - Target-specific dag combine xforms for +/// ISD::STORE. +static SDValue PerformSTORECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + StoreSDNode *St = cast<StoreSDNode>(N); + if (St->isVolatile()) + return SDValue(); + SDValue StVal = St->getValue(); + EVT VT = StVal.getValueType(); + + if (Subtarget->hasNEON()) + if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG)) + return Store; + + if (Subtarget->hasMVEIntegerOps()) + if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG)) + return NewToken; if (!ISD::isNormalStore(St)) return SDValue(); @@ -12522,7 +13405,7 @@ static SDValue PerformSTORECombine(SDNode *N, } // If this is a legal vector store, try to combine it into a VST1_UPD.
- if (ISD::isNormalStore(N) && VT.isVector() && + if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() && DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) return CombineBaseUpdate(N, DCI); @@ -12890,6 +13773,71 @@ static SDValue PerformShiftCombine(SDNode *N, return SDValue(); } +// Look for a sign/zero extend of a larger than legal load. This can be split +// into two extending loads, which are simpler to deal with than an arbitrary +// sign extend. +static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() != ISD::LOAD) + return SDValue(); + LoadSDNode *LD = cast<LoadSDNode>(N0.getNode()); + if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() || + LD->getExtensionType() != ISD::NON_EXTLOAD) + return SDValue(); + EVT FromVT = LD->getValueType(0); + EVT ToVT = N->getValueType(0); + if (!ToVT.isVector()) + return SDValue(); + assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); + EVT ToEltVT = ToVT.getVectorElementType(); + EVT FromEltVT = FromVT.getVectorElementType(); + + unsigned NumElements = 0; + if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8)) + NumElements = 4; + if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8) + NumElements = 8; + if (NumElements == 0 || + FromVT.getVectorNumElements() == NumElements || + FromVT.getVectorNumElements() % NumElements != 0 || + !isPowerOf2_32(NumElements)) + return SDValue(); + + SDLoc DL(LD); + // Details about the old load + SDValue Ch = LD->getChain(); + SDValue BasePtr = LD->getBasePtr(); + unsigned Alignment = LD->getOriginalAlignment(); + MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); + AAMDNodes AAInfo = LD->getAAInfo(); + + ISD::LoadExtType NewExtType = + N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + SDValue Offset = DAG.getUNDEF(BasePtr.getValueType()); + EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext()); + EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext()); + unsigned NewOffset = NewFromVT.getSizeInBits() / 8; + SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset); + + // Split the load in half, each side of which is extended separately. This + // is good enough, as legalisation will take it from there. They are either + // already legal or they will be split further into something that is + // legal. + SDValue NewLoad1 = + DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset, + LD->getPointerInfo(), NewFromVT, Alignment, MMOFlags, AAInfo); + SDValue NewLoad2 = + DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, + LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, + Alignment, MMOFlags, AAInfo); + + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + SDValue(NewLoad1.getNode(), 1), + SDValue(NewLoad2.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2); +} + /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. 
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, @@ -12927,6 +13875,10 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, } } + if (ST->hasMVEIntegerOps()) + if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG)) + return NewLoad; + return SDValue(); } @@ -13028,43 +13980,169 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D return V; } +// Given N, the value controlling the conditional branch, search for the loop +// intrinsic, returning it, along with how the value is used. We need to handle +// patterns such as the following: +// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit) +// (brcond (setcc (loop.decrement), 0, eq), exit) +// (brcond (setcc (loop.decrement), 0, ne), header) +static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, + bool &Negate) { + switch (N->getOpcode()) { + default: + break; + case ISD::XOR: { + if (!isa<ConstantSDNode>(N.getOperand(1))) + return SDValue(); + if (!cast<ConstantSDNode>(N.getOperand(1))->isOne()) + return SDValue(); + Negate = !Negate; + return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate); + } + case ISD::SETCC: { + auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1)); + if (!Const) + return SDValue(); + if (Const->isNullValue()) + Imm = 0; + else if (Const->isOne()) + Imm = 1; + else + return SDValue(); + CC = cast<CondCodeSDNode>(N.getOperand(2))->get(); + return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate); + } + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue(); + if (IntOp != Intrinsic::test_set_loop_iterations && + IntOp != Intrinsic::loop_decrement_reg) + return SDValue(); + return N; + } + } + return SDValue(); +} + static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST) { - // Look for (brcond (xor test.set.loop.iterations, -1) - SDValue CC = N->getOperand(1); - unsigned Opc = CC->getOpcode(); - SDValue Int; - if ((Opc == ISD::XOR || Opc == ISD::SETCC) && - (CC->getOperand(0)->getOpcode() == ISD::INTRINSIC_W_CHAIN)) { + // The hwloop intrinsics that we're interested in are used for control-flow, + // either for entering or exiting the loop: + // - test.set.loop.iterations will test whether its operand is zero. If it + // is zero, the following branch should not enter the loop. + // - loop.decrement.reg also tests whether its operand is zero. If it is + // zero, the following branch should not branch back to the beginning of + // the loop. + // So here, we need to check how the brcond is using the result of each + // of the intrinsics to ensure that we're branching to the right place at the + // right time.
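The first two example patterns above are the same test written two ways: wrapping a setcc in an xor-with-1 is equivalent to flipping its condition, which is what the Negate flag records as SearchLoopIntrinsic recurses. A small demonstration of that equivalence (illustrative):

    #include <cassert>

    int main() {
      // (xor (setcc x, 0, ne), 1) behaves exactly like (setcc x, 0, eq), so
      // the xor can be absorbed by inverting the condition code later on.
      for (int x = 0; x < 4; ++x) {
        bool ne = (x != 0);
        assert((ne ^ 1) == static_cast<int>(x == 0));
      }
      return 0;
    }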
- assert((isa<ConstantSDNode>(CC->getOperand(1)) && - cast<ConstantSDNode>(CC->getOperand(1))->isOne()) && - "Expected to compare against 1"); + ISD::CondCode CC; + SDValue Cond; + int Imm = 1; + bool Negate = false; + SDValue Chain = N->getOperand(0); + SDValue Dest; - Int = CC->getOperand(0); - } else if (CC->getOpcode() == ISD::INTRINSIC_W_CHAIN) - Int = CC; - else - return SDValue(); + if (N->getOpcode() == ISD::BRCOND) { + CC = ISD::SETEQ; + Cond = N->getOperand(1); + Dest = N->getOperand(2); + } else { + assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!"); + CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); + Cond = N->getOperand(2); + Dest = N->getOperand(4); + if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) { + if (!Const->isOne() && !Const->isNullValue()) + return SDValue(); + Imm = Const->getZExtValue(); + } else + return SDValue(); + } - unsigned IntOp = cast<ConstantSDNode>(Int.getOperand(1))->getZExtValue(); - if (IntOp != Intrinsic::test_set_loop_iterations) + SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate); + if (!Int) return SDValue(); + if (Negate) + CC = ISD::getSetCCInverse(CC, true); + + auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) { + return (CC == ISD::SETEQ && Imm == 0) || + (CC == ISD::SETNE && Imm == 1) || + (CC == ISD::SETLT && Imm == 1) || + (CC == ISD::SETULT && Imm == 1); + }; + + auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) { + return (CC == ISD::SETEQ && Imm == 1) || + (CC == ISD::SETNE && Imm == 0) || + (CC == ISD::SETGT && Imm == 0) || + (CC == ISD::SETUGT && Imm == 0) || + (CC == ISD::SETGE && Imm == 1) || + (CC == ISD::SETUGE && Imm == 1); + }; + + assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) && + "unsupported condition"); + SDLoc dl(Int); - SDValue Chain = N->getOperand(0); + SelectionDAG &DAG = DCI.DAG; SDValue Elements = Int.getOperand(2); - SDValue ExitBlock = N->getOperand(2); + unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue(); + assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR) + && "expected single br user"); + SDNode *Br = *N->use_begin(); + SDValue OtherTarget = Br->getOperand(1); - // TODO: Once we start supporting tail predication, we can add another - // operand to WLS for the number of elements processed in a vector loop. + // Update the unconditional branch to branch to the given Dest. + auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) { + SDValue NewBrOps[] = { Br->getOperand(0), Dest }; + SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps); + DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr); + }; - SDValue Ops[] = { Chain, Elements, ExitBlock }; - SDValue Res = DCI.DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); - DCI.DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0)); - return Res; + if (IntOp == Intrinsic::test_set_loop_iterations) { + SDValue Res; + // We expect this 'instruction' to branch when the counter is zero. + if (IsTrueIfZero(CC, Imm)) { + SDValue Ops[] = { Chain, Elements, Dest }; + Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); + } else { + // The logic is the reverse of what we need for WLS, so find the other + // basic block target: the target of the proceeding br. 
+ UpdateUncondBr(Br, Dest, DAG); + + SDValue Ops[] = { Chain, Elements, OtherTarget }; + Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); + } + DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0)); + return Res; + } else { + SDValue Size = DAG.getTargetConstant( + cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32); + SDValue Args[] = { Int.getOperand(0), Elements, Size, }; + SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl, + DAG.getVTList(MVT::i32, MVT::Other), Args); + DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode()); + + // We expect this instruction to branch when the count is not zero. + SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget; + + // Update the unconditional branch to target the loop preheader if we've + // found the condition has been reversed. + if (Target == OtherTarget) + UpdateUncondBr(Br, Dest, DAG); + + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + SDValue(LoopDec.getNode(), 1), Chain); + + SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target }; + return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs); + } + return SDValue(); } /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. @@ -13298,14 +14376,15 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::OR: return PerformORCombine(N, DCI, Subtarget); case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); - case ISD::BRCOND: return PerformHWLoopCombine(N, DCI, Subtarget); + case ISD::BRCOND: + case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget); case ARMISD::ADDC: case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); case ARMISD::BFI: return PerformBFICombine(N, DCI); case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); - case ISD::STORE: return PerformSTORECombine(N, DCI); + case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget); case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); @@ -13334,6 +14413,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return PerformVLDCombine(N, DCI); case ARMISD::BUILD_VECTOR: return PerformARMBUILD_VECTORCombine(N, DCI); + case ARMISD::PREDICATE_CAST: + return PerformPREDICATE_CASTCombine(N, DCI); case ARMISD::SMULWB: { unsigned BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); @@ -13348,7 +14429,9 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); break; } - case ARMISD::SMLALBB: { + case ARMISD::SMLALBB: + case ARMISD::QADD16b: + case ARMISD::QSUB16b: { unsigned BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || @@ -13384,6 +14467,15 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); break; } + case ARMISD::QADD8b: + case ARMISD::QSUB8b: { + unsigned BitWidth = N->getValueType(0).getSizeInBits(); + APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8); + if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || + (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) + return SDValue(); + break; + } 
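The QADD8b/QSUB8b cases added above reuse the demanded-bits idea from the SMLALBB case: the node reads only the low 8 bits of each operand, so the combiner may simplify whatever produces the upper bits. A model of the underlying property, sketched for one lane (qadd8 is an invented helper, not the ISD node):

    #include <cassert>
    #include <cstdint>

    // One lane of a saturating signed 8-bit add: only bits 0-7 of the inputs
    // can influence the result, which justifies the low-8 DemandedMask.
    int8_t qadd8(uint32_t a, uint32_t b) {
      int v = static_cast<int8_t>(a) + static_cast<int8_t>(b);
      if (v > 127) v = 127;    // saturate high
      if (v < -128) v = -128;  // saturate low
      return static_cast<int8_t>(v);
    }

    int main() {
      uint32_t as[] = {0x00000005u, 0xdeadbe7fu, 0xffffff80u};
      uint32_t bs[] = {0x00000001u, 0x12345680u, 0xffffffffu};
      for (uint32_t a : as)
        for (uint32_t b : bs)
          assert(qadd8(a, b) == qadd8(a & 0xff, b & 0xff)); // high bits ignored
      return 0;
    }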
case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { @@ -13457,47 +14549,38 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, if (!Subtarget->hasMVEIntegerOps()) return false; - if (Ty != MVT::v16i8 && Ty != MVT::v8i16 && Ty != MVT::v8f16 && - Ty != MVT::v4i32 && Ty != MVT::v4f32 && Ty != MVT::v2i64 && - Ty != MVT::v2f64 && - // These are for truncated stores - Ty != MVT::v4i8 && Ty != MVT::v8i8 && Ty != MVT::v4i16) - return false; - if (Subtarget->isLittle()) { - // In little-endian MVE, the store instructions VSTRB.U8, - // VSTRH.U16 and VSTRW.U32 all store the vector register in - // exactly the same format, and differ only in the range of - // their immediate offset field and the required alignment. - // - // In particular, VSTRB.U8 can store a vector at byte alignment. - // So at this stage we can simply say that loads/stores of all - // 128-bit wide vector types are permitted at any alignment, - // because we know at least _one_ instruction can manage that. - // - // Later on we might find that some of those loads are better - // generated as VLDRW.U32 if alignment permits, to take - // advantage of the larger immediate range. But for the moment, - // all that matters is that if we don't lower the load then - // _some_ instruction can handle it. + // These are for predicates + if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) { + if (Fast) + *Fast = true; + return true; + } + + // These are for truncated stores/narrowing loads. They are fine so long as + // the alignment is at least the size of the item being loaded + if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) && + Alignment >= VT.getScalarSizeInBits() / 8) { + if (Fast) + *Fast = true; + return true; + } + + // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and + // VSTRW.U32 all store the vector register in exactly the same format, and + // differ only in the range of their immediate offset field and the required + // alignment. So there is always a store that can be used, regardless of + // actual type. + // + // For big endian, that is not the case. But can still emit a (VSTRB.U8; + // VREV64.8) pair and get the same effect. This will likely be better than + // aligning the vector through the stack. + if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 || + Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 || + Ty == MVT::v2f64) { if (Fast) *Fast = true; return true; - } else { - // In big-endian MVE, those instructions aren't so similar - // after all, because they reorder the bytes of the vector - // differently. So this time we can only store a particular - // kind of vector if its alignment is at least the element - // type. And we can't store vectors of i64 or f64 at all - // without having to do some postprocessing, because there's - // no VSTRD.U64. - if (Ty == MVT::v16i8 || - ((Ty == MVT::v8i16 || Ty == MVT::v8f16) && Alignment >= 2) || - ((Ty == MVT::v4i32 || Ty == MVT::v4f32) && Alignment >= 4)) { - if (Fast) - *Fast = true; - return true; - } } return false; @@ -13617,22 +14700,60 @@ static bool areExtractExts(Value *Ext1, Value *Ext2) { /// sext/zext can be folded into vsubl. 
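The rewritten allowsMisalignedMemoryAccesses logic above collapses into three rules: predicate vectors are always fine, sub-128-bit (truncating store / narrowing load) types need element alignment, and any 128-bit vector can be handled byte-aligned because VSTRB.U8 (plus a VREV64.8 on big-endian) always exists. A standalone restatement in plain C++, with simplified type descriptors local to this sketch:

    #include <cassert>

    // Stand-in for the MVT cases above: {lanes, lane bits, is-predicate}.
    struct Ty { int Lanes, LaneBits; bool Predicate; };

    bool allowsMisalignedMVE(Ty T, unsigned AlignBytes) {
      if (T.Predicate)                 // v16i1/v8i1/v4i1: always fine
        return true;
      int Bits = T.Lanes * T.LaneBits;
      if (Bits < 128)                  // truncating store / narrowing load:
        return AlignBytes >= unsigned(T.LaneBits / 8); // element alignment
      if (Bits == 128)                 // full vector: a byte store (with a
        return true;                   // byte reverse on BE) always works
      return false;
    }

    int main() {
      assert(allowsMisalignedMVE({16, 8, false}, 1));  // v16i8, byte-aligned
      assert(allowsMisalignedMVE({4, 16, false}, 2));  // v4i16 at 2 bytes
      assert(!allowsMisalignedMVE({4, 16, false}, 1)); // ...but not at 1
      assert(allowsMisalignedMVE({4, 1, true}, 1));    // v4i1 predicate
      return 0;
    }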
bool ARMTargetLowering::shouldSinkOperands(Instruction *I, SmallVectorImpl<Use *> &Ops) const { - if (!Subtarget->hasNEON() || !I->getType()->isVectorTy()) + if (!I->getType()->isVectorTy()) return false; - switch (I->getOpcode()) { - case Instruction::Sub: - case Instruction::Add: { - if (!areExtractExts(I->getOperand(0), I->getOperand(1))) + if (Subtarget->hasNEON()) { + switch (I->getOpcode()) { + case Instruction::Sub: + case Instruction::Add: { + if (!areExtractExts(I->getOperand(0), I->getOperand(1))) + return false; + Ops.push_back(&I->getOperandUse(0)); + Ops.push_back(&I->getOperandUse(1)); + return true; + } + default: return false; - Ops.push_back(&I->getOperandUse(0)); - Ops.push_back(&I->getOperandUse(1)); - return true; + } } - default: + + if (!Subtarget->hasMVEIntegerOps()) + return false; + + auto IsSinker = [](Instruction *I, int Operand) { + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::Mul: + return true; + case Instruction::Sub: + return Operand == 1; + default: + return false; + } + }; + + int Op = 0; + if (!isa<ShuffleVectorInst>(I->getOperand(Op))) + Op = 1; + if (!IsSinker(I, Op)) + return false; + if (!match(I->getOperand(Op), + m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()), + m_Undef(), m_Zero()))) { return false; } - return false; + Instruction *Shuffle = cast<Instruction>(I->getOperand(Op)); + // All uses of the shuffle should be sunk to avoid duplicating it across gpr + // and vector registers + for (Use &U : Shuffle->uses()) { + Instruction *Insn = cast<Instruction>(U.getUser()); + if (!IsSinker(Insn, U.getOperandNo())) + return false; + } + Ops.push_back(&Shuffle->getOperandUse(0)); + Ops.push_back(&I->getOperandUse(Op)); + return true; } bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { @@ -13641,6 +14762,11 @@ bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { if (!isTypeLegal(VT)) return false; + if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) { + if (Ld->isExpandingLoad()) + return false; + } + // Don't create a loadext if we can fold the extension into a wide/long // instruction. // If there's more than one user instruction, the loadext is desirable no @@ -14028,6 +15154,52 @@ static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, return false; } +static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, + bool isSEXTLoad, bool isLE, SDValue &Base, + SDValue &Offset, bool &isInc, + SelectionDAG &DAG) { + if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) + return false; + if (!isa<ConstantSDNode>(Ptr->getOperand(1))) + return false; + + ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1)); + int RHSC = (int)RHS->getZExtValue(); + + auto IsInRange = [&](int RHSC, int Limit, int Scale) { + if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) { + assert(Ptr->getOpcode() == ISD::ADD); + isInc = false; + Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); + return true; + } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) { + isInc = Ptr->getOpcode() == ISD::ADD; + Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); + return true; + } + return false; + }; + + // Try to find a matching instruction based on s/zext, Alignment, Offset and + // (in BE) type. 
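The MVE arm of shouldSinkOperands above matches the canonical splat shape, shufflevector(insertelement(undef, %x, 0), undef, zeroinitializer), and sinks it to each user so that selection can fold the scalar into a per-use vdup-style operand instead of keeping the splat live in a vector register. A minimal standalone model of the per-operand decision table (plain C++; the enum is invented for this note):

    #include <cassert>

    enum Op { Add, Mul, Sub, Other };

    // Mirrors the IsSinker lambda: Add and Mul accept a splat in either
    // operand, Sub only on the right, and anything else refuses, which in
    // turn blocks sinking for every user of the splat.
    bool sinkable(Op O, int OperandIdx) {
      switch (O) {
      case Add:
      case Mul: return true;
      case Sub: return OperandIdx == 1;
      default:  return false;
      }
    }

    int main() {
      assert(sinkable(Add, 0) && sinkable(Add, 1));
      assert(!sinkable(Sub, 0) && sinkable(Sub, 1));
      assert(!sinkable(Other, 0));
      return 0;
    }

Note the asymmetry for Sub: only the right-hand operand may be a sunk splat, matching the Operand == 1 check in the hunk.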
+ Base = Ptr->getOperand(0); + if (VT == MVT::v4i16) { + if (Align >= 2 && IsInRange(RHSC, 0x80, 2)) + return true; + } else if (VT == MVT::v4i8 || VT == MVT::v8i8) { + if (IsInRange(RHSC, 0x80, 1)) + return true; + } else if (Align >= 4 && (isLE || VT == MVT::v4i32 || VT == MVT::v4f32) && + IsInRange(RHSC, 0x80, 4)) + return true; + else if (Align >= 2 && (isLE || VT == MVT::v8i16 || VT == MVT::v8f16) && + IsInRange(RHSC, 0x80, 2)) + return true; + else if ((isLE || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1)) + return true; + return false; +} + /// getPreIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if the node's address /// can be legally represented as pre-indexed load / store address. @@ -14041,25 +15213,35 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, EVT VT; SDValue Ptr; + unsigned Align; bool isSEXTLoad = false; if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { Ptr = LD->getBasePtr(); - VT = LD->getMemoryVT(); + VT = LD->getMemoryVT(); + Align = LD->getAlignment(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { Ptr = ST->getBasePtr(); - VT = ST->getMemoryVT(); + VT = ST->getMemoryVT(); + Align = ST->getAlignment(); } else return false; bool isInc; bool isLegal = false; - if (Subtarget->isThumb2()) - isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, - Offset, isInc, DAG); - else - isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, - Offset, isInc, DAG); + if (VT.isVector()) + isLegal = Subtarget->hasMVEIntegerOps() && + getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad, + Subtarget->isLittle(), Base, Offset, + isInc, DAG); + else { + if (Subtarget->isThumb2()) + isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, + Offset, isInc, DAG); + else + isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, + Offset, isInc, DAG); + } if (!isLegal) return false; @@ -14077,15 +15259,18 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, SelectionDAG &DAG) const { EVT VT; SDValue Ptr; + unsigned Align; bool isSEXTLoad = false, isNonExt; if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { - VT = LD->getMemoryVT(); + VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); + Align = LD->getAlignment(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { - VT = ST->getMemoryVT(); + VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); + Align = ST->getAlignment(); isNonExt = !ST->isTruncatingStore(); } else return false; @@ -14108,12 +15293,19 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, bool isInc; bool isLegal = false; - if (Subtarget->isThumb2()) - isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, - isInc, DAG); - else - isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, + if (VT.isVector()) + isLegal = Subtarget->hasMVEIntegerOps() && + getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, + Subtarget->isLittle(), Base, Offset, isInc, DAG); + else { + if (Subtarget->isThumb2()) + isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, + isInc, DAG); + else + isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, + isInc, DAG); + } if (!isLegal) return false; @@ -14369,7 +15561,8 @@ const char 
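Both indexed-address hooks now route vector types through getMVEIndexedAddressParts, whose core is the IsInRange lambda shown earlier: the constant offset must be a nonzero multiple of the element scale and fit a signed 7-bit scaled immediate (Limit is 0x80 in the hunk). A standalone sketch of that range test in plain C++:

    #include <cassert>
    #include <cstdlib>

    // Offset legality for an MVE pre/post-indexed access: RHSC must be a
    // multiple of Scale and |RHSC| must stay below Limit * Scale.
    bool inRange(int RHSC, int Limit, int Scale) {
      return RHSC != 0 && RHSC % Scale == 0 &&
             std::abs(RHSC) < Limit * Scale;
    }

    int main() {
      assert(inRange(4, 0x80, 4));    // word-scaled, smallest step
      assert(!inRange(6, 0x80, 4));   // not a multiple of the scale
      assert(inRange(-508, 0x80, 4)); // negative offsets take the SUB form
      assert(!inRange(512, 0x80, 4)); // 0x80 * 4 is outside the 7-bit range
      return 0;
    }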
*ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const { /// constraint it is for this target. ARMTargetLowering::ConstraintType ARMTargetLowering::getConstraintType(StringRef Constraint) const { - if (Constraint.size() == 1) { + unsigned S = Constraint.size(); + if (S == 1) { switch (Constraint[0]) { default: break; case 'l': return C_RegisterClass; @@ -14377,12 +15570,12 @@ ARMTargetLowering::getConstraintType(StringRef Constraint) const { case 'h': return C_RegisterClass; case 'x': return C_RegisterClass; case 't': return C_RegisterClass; - case 'j': return C_Other; // Constant for movw. - // An address with a single base register. Due to the way we - // currently handle addresses it is the same as an 'r' memory constraint. + case 'j': return C_Immediate; // Constant for movw. + // An address with a single base register. Due to the way we + // currently handle addresses it is the same as an 'r' memory constraint. case 'Q': return C_Memory; } - } else if (Constraint.size() == 2) { + } else if (S == 2) { switch (Constraint[0]) { default: break; case 'T': return C_RegisterClass; @@ -14535,7 +15728,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, case 'j': // Constant suitable for movw, must be between 0 and // 65535. - if (Subtarget->hasV6T2Ops()) + if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps())) if (CVal >= 0 && CVal <= 65535) break; return; @@ -14643,7 +15836,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'N': - if (Subtarget->isThumb()) { // FIXME thumb2 + if (Subtarget->isThumb1Only()) { // This must be a constant between 0 and 31, for shift amounts. if (CVal >= 0 && CVal <= 31) break; @@ -14651,7 +15844,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'O': - if (Subtarget->isThumb()) { // FIXME thumb2 + if (Subtarget->isThumb1Only()) { // This must be a multiple of 4 between -508 and 508, for // ADD/SUB sp = sp + immediate. if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) @@ -14874,6 +16067,7 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { // without FP16. So we must do a function call. 
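The inline-asm constraint updates above are easy to misread in diff form: 'j' moves from C_Other to C_Immediate and is now also accepted on v8-M Baseline, which has MOVW without being V6T2, while the 'N' and 'O' constraints tighten from all of Thumb to Thumb1-only. A compact standalone restatement (plain C++; the Feats struct and helper name are invented here):

    #include <cassert>

    struct Feats { bool V6T2, V8MBaseline, Thumb1Only; };

    // 'j': constant suitable for MOVW; 'N': Thumb1 shift amount;
    // 'O': Thumb1 SP adjustment, a multiple of 4 in [-508, 508].
    bool constraintOK(char C, long V, Feats F) {
      switch (C) {
      case 'j': return (F.V6T2 || F.V8MBaseline) && V >= 0 && V <= 65535;
      case 'N': return F.Thumb1Only && V >= 0 && V <= 31;
      case 'O': return F.Thumb1Only && V >= -508 && V <= 508 && (V & 3) == 0;
      default:  return false;
      }
    }

    int main() {
      assert(constraintOK('j', 1234, {false, true, false})); // v8-M Baseline
      assert(!constraintOK('N', 7, {true, false, false}));   // Thumb2: no
      assert(constraintOK('O', -8, {false, false, true}));
      return 0;
    }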
SDLoc Loc(Op); RTLIB::Libcall LC; + MakeLibCallOptions CallOptions; if (SrcSz == 16) { // Instruction from 16 -> 32 if (Subtarget->hasFP16()) @@ -14884,7 +16078,7 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected type for custom-lowering FP_EXTEND"); SrcVal = - makeLibCall(DAG, LC, MVT::f32, SrcVal, /*isSigned*/ false, Loc).first; + makeLibCall(DAG, LC, MVT::f32, SrcVal, CallOptions, Loc).first; } } @@ -14897,7 +16091,7 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { LC = RTLIB::getFPEXT(MVT::f32, MVT::f64); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected type for custom-lowering FP_EXTEND"); - return makeLibCall(DAG, LC, MVT::f64, SrcVal, /*isSigned*/ false, Loc).first; + return makeLibCall(DAG, LC, MVT::f64, SrcVal, CallOptions, Loc).first; } SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { @@ -14923,7 +16117,8 @@ SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected type for custom-lowering FP_ROUND"); - return makeLibCall(DAG, LC, DstVT, SrcVal, /*isSigned*/ false, Loc).first; + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, Loc).first; } void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results, @@ -15015,7 +16210,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); - Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); + Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue()); // volatile loads with NEON intrinsics not supported Info.flags = MachineMemOperand::MOLoad; return true; @@ -15030,7 +16225,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; - Info.align = 0; + Info.align.reset(); // volatile loads with NEON intrinsics not supported Info.flags = MachineMemOperand::MOLoad; return true; @@ -15056,7 +16251,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); - Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); + Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue()); // volatile stores with NEON intrinsics not supported Info.flags = MachineMemOperand::MOStore; return true; @@ -15077,7 +16272,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = 0; + Info.align.reset(); // volatile stores with NEON intrinsics not supported Info.flags = MachineMemOperand::MOStore; return true; @@ -15090,7 +16285,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); + Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } @@ -15102,7 +16297,7 @@ bool 
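The Info.align churn above is the MaybeAlign migration: the old unsigned field used 0 as a magic "unknown" value, and the new type makes that state explicit, with Info.align.reset() for unknown and Align(8) guaranteeing a power of two. A minimal stand-in illustrating the distinction (plain C++; this is not the llvm::MaybeAlign implementation):

    #include <cassert>
    #include <optional>

    // Minimal stand-in for a maybe-alignment: an optional byte alignment.
    struct MaybeAlign {
      std::optional<unsigned> A;
      MaybeAlign() = default;
      explicit MaybeAlign(unsigned V) { if (V) A = V; } // 0 stays "unset"
      void reset() { A.reset(); }
      bool known() const { return A.has_value(); }
    };

    int main() {
      MaybeAlign Unknown;  // corresponds to Info.align.reset() above
      MaybeAlign Eight(8); // corresponds to Info.align = Align(8)
      assert(!Unknown.known() && Eight.known() && *Eight.A == 8);
      return 0;
    }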
ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); + Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } @@ -15112,7 +16307,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::i64; Info.ptrVal = I.getArgOperand(2); Info.offset = 0; - Info.align = 8; + Info.align = Align(8); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; @@ -15122,7 +16317,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::i64; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = 8; + Info.align = Align(8); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; @@ -15473,6 +16668,12 @@ bool ARMTargetLowering::isLegalInterleavedAccessType( return VecSize == 64 || VecSize % 128 == 0; } +unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { + if (Subtarget->hasNEON()) + return 4; + return TargetLoweringBase::getMaxSupportedInterleaveFactor(); +} + /// Lower an interleaved load into a vldN intrinsic. /// /// E.g. Lower an interleaved load (Factor = 2): @@ -15792,15 +16993,15 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, } /// Return the correct alignment for the current calling convention. -unsigned -ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy, - DataLayout DL) const { +Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy, + DataLayout DL) const { + const Align ABITypeAlign(DL.getABITypeAlignment(ArgTy)); if (!ArgTy->isVectorTy()) - return DL.getABITypeAlignment(ArgTy); + return ABITypeAlign; // Avoid over-aligning vector parameters. It would require realigning the // stack and waste space for no real benefit. - return std::min(DL.getABITypeAlignment(ArgTy), DL.getStackAlignment()); + return std::min(ABITypeAlign, DL.getStackAlignment()); } /// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of @@ -15861,7 +17062,7 @@ void ARMTargetLowering::insertCopiesSplitCSR( else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); - unsigned NewVR = MRI->createVirtualRegister(RC); + Register NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 1675ec59a354..53813fad5afd 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -103,6 +103,7 @@ class VectorType; ADDE, // Add using carry SUBC, // Sub with carry SUBE, // Sub using carry + LSLS, // Shift left producing carry VMOVRRD, // double to two gprs. VMOVDRR, // Two gprs to double. @@ -126,17 +127,13 @@ class VectorType; WIN__DBZCHK, // Windows' divide by zero check WLS, // Low-overhead loops, While Loop Start + LOOP_DEC, // Really a part of LE, performs the sub + LE, // Low-overhead loops, Loop End - VCEQ, // Vector compare equal. - VCEQZ, // Vector compare equal to zero. - VCGE, // Vector compare greater than or equal. - VCGEZ, // Vector compare greater than or equal to zero. - VCLEZ, // Vector compare less than or equal to zero. 
- VCGEU, // Vector compare unsigned greater than or equal. - VCGT, // Vector compare greater than. - VCGTZ, // Vector compare greater than zero. - VCLTZ, // Vector compare less than zero. - VCGTU, // Vector compare unsigned greater than. + PREDICATE_CAST, // Predicate cast for MVE i1 types + + VCMP, // Vector compare. + VCMPZ, // Vector compare to zero. VTST, // Vector test bits. // Vector shift by vector @@ -200,6 +197,7 @@ class VectorType; VTRN, // transpose VTBL1, // 1-register shuffle with mask VTBL2, // 2-register shuffle with mask + VMOVN, // MVE vmovn // Vector multiply long: VMULLs, // ...signed @@ -221,6 +219,12 @@ class VectorType; SMMLAR, // Signed multiply long, round and add SMMLSR, // Signed multiply long, subtract and round + // Single Lane QADD8 and QADD16. Only the bottom lane. That's what the b stands for. + QADD8b, + QSUB8b, + QADD16b, + QSUB16b, + // Operands of the standard BUILD_VECTOR node are not legalized, which // is fine if BUILD_VECTORs are always lowered to shuffles or other // operations, but for ARM some BUILD_VECTORs are legal as-is and their @@ -243,6 +247,11 @@ class VectorType; // instructions. MEMCPY, + // V8.1MMainline condition select + CSINV, // Conditional select invert. + CSNEG, // Conditional select negate. + CSINC, // Conditional select increment. + // Vector load N-element structure to all lanes: VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, VLD2DUP, @@ -539,7 +548,7 @@ class VectorType; Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const override; - unsigned getMaxSupportedInterleaveFactor() const override { return 4; } + unsigned getMaxSupportedInterleaveFactor() const override; bool lowerInterleavedLoad(LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, @@ -608,8 +617,8 @@ class VectorType; void finalizeLowering(MachineFunction &MF) const override; /// Return the correct alignment for the current calling convention. 
- unsigned getABIAlignmentForCallingConv(Type *ArgTy, - DataLayout DL) const override; + Align getABIAlignmentForCallingConv(Type *ArgTy, + DataLayout DL) const override; bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override; @@ -670,6 +679,8 @@ class VectorType; SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; @@ -721,8 +732,8 @@ class VectorType; void lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const; - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl<SDNode *> &Created) const override; @@ -814,7 +825,7 @@ class VectorType; SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &ARMcc, SelectionDAG &DAG, const SDLoc &dl) const; SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, - const SDLoc &dl, bool InvalidOnQNaN) const; + const SDLoc &dl) const; SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const; SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const; @@ -838,7 +849,7 @@ class VectorType; void setAllExpand(MVT VT); }; - enum NEONModImmType { + enum VMOVModImmType { VMOVModImm, VMVNModImm, MVEVMVNModImm, diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index bc93a058720c..1da32ad2af6c 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -188,6 +188,13 @@ def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> { let DecoderMethod = "DecodeCCOutOperand"; } +// Transform to generate the inverse of a condition code during ISel +def inv_cond_XFORM : SDNodeXForm<imm, [{ + ARMCC::CondCodes CC = static_cast<ARMCC::CondCodes>(N->getZExtValue()); + return CurDAG->getTargetConstant(ARMCC::getOppositeCondition(CC), SDLoc(N), + MVT::i32); +}]>; + // VPT predicate def VPTPredNOperand : AsmOperandClass { @@ -401,6 +408,8 @@ class InstTemplate<AddrMode am, int sz, IndexMode im, // mnemonic (when not in an IT block) or preclude it (when in an IT block). bit thumbArithFlagSetting = 0; + bit validForTailPredication = 0; + // If this is a pseudo instruction, mark it isCodeGenOnly. let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo"); @@ -412,6 +421,7 @@ class InstTemplate<AddrMode am, int sz, IndexMode im, let TSFlags{14} = canXformTo16Bit; let TSFlags{18-15} = D.Value; let TSFlags{19} = thumbArithFlagSetting; + let TSFlags{20} = validForTailPredication; let Constraints = cstr; let Itinerary = itin; @@ -455,6 +465,7 @@ class AsmPseudoInst<string asm, dag iops, dag oops = (outs)> let isCodeGenOnly = 0; // So we get asm matcher for it. 
let AsmString = asm; let isPseudo = 1; + let hasNoSchedulingInfo = 1; } class ARMAsmPseudo<string asm, dag iops, dag oops = (outs)> @@ -2282,7 +2293,7 @@ class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6, let Inst{24} = SIMM{7}; let Inst{18-16} = SIMM{6-4}; let Inst{3-0} = SIMM{3-0}; - let DecoderMethod = "DecodeNEONModImmInstruction"; + let DecoderMethod = "DecodeVMOVModImmInstruction"; } // NEON 2 vector register format. @@ -2724,6 +2735,16 @@ def complexrotateopodd : Operand<i32> { let PrintMethod = "printComplexRotationOp<180, 90>"; } +def MveSaturateOperand : AsmOperandClass { + let PredicateMethod = "isMveSaturateOp"; + let DiagnosticString = "saturate operand must be 48 or 64"; + let Name = "MveSaturate"; +} +def saturateop : Operand<i32> { + let ParserMatchClass = MveSaturateOperand; + let PrintMethod = "printMveSaturateOp"; +} + // Data type suffix token aliases. Implements Table A7-3 in the ARM ARM. def : TokenAlias<".s8", ".i8">; def : TokenAlias<".u8", ".i8">; diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp index 388c889349b7..a802d5a06f07 100644 --- a/lib/Target/ARM/ARMInstrInfo.cpp +++ b/lib/Target/ARM/ARMInstrInfo.cpp @@ -117,7 +117,7 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const { MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); - unsigned Reg = MI->getOperand(0).getReg(); + Register Reg = MI->getOperand(0).getReg(); MachineInstrBuilder MIB; MIB = BuildMI(MBB, MI, DL, get(ARM::MOV_ga_pcrel_ldr), Reg) diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index e35145463852..fe696222ec70 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -51,8 +51,6 @@ def SDT_ARMAnd : SDTypeProfile<1, 2, SDTCisVT<2, i32>]>; def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>; -def SDT_ARMFCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, - SDTCisVT<2, i32>]>; def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisPtrTy<1>, SDTCisVT<2, i32>]>; @@ -108,14 +106,24 @@ def SDT_ARMIntShiftParts : SDTypeProfile<2, 3, [SDTCisSameAs<0, 1>, // TODO Add another operand for 'Size' so that we can re-use this node when we // start supporting *TP versions. 
-def SDT_ARMWhileLoop : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, - SDTCisVT<1, OtherVT>]>; +def SDT_ARMLoLoop : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, + SDTCisVT<1, OtherVT>]>; def ARMSmlald : SDNode<"ARMISD::SMLALD", SDT_LongMac>; def ARMSmlaldx : SDNode<"ARMISD::SMLALDX", SDT_LongMac>; def ARMSmlsld : SDNode<"ARMISD::SMLSLD", SDT_LongMac>; def ARMSmlsldx : SDNode<"ARMISD::SMLSLDX", SDT_LongMac>; +def SDT_ARMCSel : SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisInt<3>, + SDTCisVT<3, i32>]>; + +def ARMcsinv : SDNode<"ARMISD::CSINV", SDT_ARMCSel, [SDNPOptInGlue]>; +def ARMcsneg : SDNode<"ARMISD::CSNEG", SDT_ARMCSel, [SDNPOptInGlue]>; +def ARMcsinc : SDNode<"ARMISD::CSINC", SDT_ARMCSel, [SDNPOptInGlue]>; + def SDT_MulHSR : SDTypeProfile<1, 3, [SDTCisVT<0,i32>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, @@ -194,6 +202,7 @@ def ARMrrx : SDNode<"ARMISD::RRX" , SDTIntUnaryOp, [SDNPInGlue ]>; def ARMaddc : SDNode<"ARMISD::ADDC", SDTBinaryArithWithFlags, [SDNPCommutative]>; def ARMsubc : SDNode<"ARMISD::SUBC", SDTBinaryArithWithFlags>; +def ARMlsls : SDNode<"ARMISD::LSLS", SDTBinaryArithWithFlags>; def ARMadde : SDNode<"ARMISD::ADDE", SDTBinaryArithWithFlagsInOut>; def ARMsube : SDNode<"ARMISD::SUBE", SDTBinaryArithWithFlagsInOut>; @@ -229,6 +238,11 @@ def ARMsmlalbt : SDNode<"ARMISD::SMLALBT", SDT_LongMac, []>; def ARMsmlaltb : SDNode<"ARMISD::SMLALTB", SDT_LongMac, []>; def ARMsmlaltt : SDNode<"ARMISD::SMLALTT", SDT_LongMac, []>; +def ARMqadd8b : SDNode<"ARMISD::QADD8b", SDT_ARMAnd, []>; +def ARMqsub8b : SDNode<"ARMISD::QSUB8b", SDT_ARMAnd, []>; +def ARMqadd16b : SDNode<"ARMISD::QADD16b", SDT_ARMAnd, []>; +def ARMqsub16b : SDNode<"ARMISD::QSUB16b", SDT_ARMAnd, []>; + // Vector operations shared between NEON and MVE def ARMvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; @@ -265,8 +279,16 @@ def ARMvshruImm : SDNode<"ARMISD::VSHRuIMM", SDTARMVSHIMM>; def ARMvshls : SDNode<"ARMISD::VSHLs", SDTARMVSH>; def ARMvshlu : SDNode<"ARMISD::VSHLu", SDTARMVSH>; -def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMWhileLoop, - [SDNPHasChain]>; +def SDTARMVCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, + SDTCisInt<3>]>; +def SDTARMVCMPZ : SDTypeProfile<1, 2, [SDTCisInt<2>]>; + +def ARMvcmp : SDNode<"ARMISD::VCMP", SDTARMVCMP>; +def ARMvcmpz : SDNode<"ARMISD::VCMPZ", SDTARMVCMPZ>; + +def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMLoLoop, [SDNPHasChain]>; +def ARMLE : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>; +def ARMLoopDec : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>; //===----------------------------------------------------------------------===// // ARM Flag Definitions. @@ -1948,7 +1970,7 @@ multiclass AI_str1nopc<bit isByte, string opc, InstrItinClass iii, /// the function. The first operand is the ID# for this instruction, the second /// is the index into the MachineConstantPool that this is, the third is the /// size in bytes of this constant pool entry. 
-let hasSideEffects = 0, isNotDuplicable = 1 in +let hasSideEffects = 0, isNotDuplicable = 1, hasNoSchedulingInfo = 1 in def CONSTPOOL_ENTRY : PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx, i32imm:$size), NoItinerary, []>; @@ -2361,6 +2383,12 @@ let isCall = 1, def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins arm_bl_target:$func), 8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>, Requires<[IsARM]>, Sched<[WriteBr]>; + + // push lr before the call + def BL_PUSHLR : ARMPseudoInst<(outs), (ins GPRlr:$ra, arm_bl_target:$func), + 4, IIC_Br, + []>, + Requires<[IsARM]>, Sched<[WriteBr]>; } let isBranch = 1, isTerminator = 1 in { @@ -3727,6 +3755,23 @@ let DecoderMethod = "DecodeQADDInstruction" in [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))]>; } +def : ARMV5TEPat<(saddsat GPR:$a, GPR:$b), + (QADD GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(ssubsat GPR:$a, GPR:$b), + (QSUB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(saddsat(saddsat rGPR:$Rm, rGPR:$Rm), rGPR:$Rn), + (QDADD rGPR:$Rm, rGPR:$Rn)>; +def : ARMV5TEPat<(ssubsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)), + (QDSUB rGPR:$Rm, rGPR:$Rn)>; +def : ARMV6Pat<(ARMqadd8b rGPR:$Rm, rGPR:$Rn), + (QADD8 rGPR:$Rm, rGPR:$Rn)>; +def : ARMV6Pat<(ARMqsub8b rGPR:$Rm, rGPR:$Rn), + (QSUB8 rGPR:$Rm, rGPR:$Rn)>; +def : ARMV6Pat<(ARMqadd16b rGPR:$Rm, rGPR:$Rn), + (QADD16 rGPR:$Rm, rGPR:$Rn)>; +def : ARMV6Pat<(ARMqsub16b rGPR:$Rm, rGPR:$Rn), + (QSUB16 rGPR:$Rm, rGPR:$Rn)>; + def UQADD16 : AAIIntrinsic<0b01100110, 0b11110001, "uqadd16", int_arm_uqadd16>; def UQADD8 : AAIIntrinsic<0b01100110, 0b11111001, "uqadd8", int_arm_uqadd8>; def UQSUB16 : AAIIntrinsic<0b01100110, 0b11110111, "uqsub16", int_arm_uqsub16>; @@ -4870,14 +4915,13 @@ def SB : AInoP<(outs), (ins), MiscFrm, NoItinerary, "sb", "", []>, let hasSideEffects = 1; } -let usesCustomInserter = 1, Defs = [CPSR] in { - -// Pseudo instruction that combines movs + predicated rsbmi -// to implement integer ABS +let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in { + // Pseudo instruction that combines movs + predicated rsbmi + // to implement integer ABS def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>; } -let usesCustomInserter = 1, Defs = [CPSR] in { +let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in { def COPY_STRUCT_BYVAL_I32 : PseudoInst< (outs), (ins GPR:$dst, GPR:$src, i32imm:$size, i32imm:$alignment), NoItinerary, @@ -5085,8 +5129,8 @@ def SWPB: AIswp<1, (outs GPRnopc:$Rt), def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", - [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, - imm:$CRm, imm:$opc2)]>, + [(int_arm_cdp timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn, + timm:$CRm, timm:$opc2)]>, Requires<[IsARM,PreV8]> { bits<4> opc1; bits<4> CRn; @@ -5109,8 +5153,8 @@ def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", - [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, - imm:$CRm, imm:$opc2)]>, + [(int_arm_cdp2 timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn, + timm:$CRm, timm:$opc2)]>, Requires<[IsARM,PreV8]> { let Inst{31-28} = 0b1111; bits<4> opc1; @@ -5289,15 +5333,15 @@ multiclass LdSt2Cop<bit load, bit Dbit, string asm, list<dag> pattern> { } } -defm LDC : LdStCop <1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, 
addrmode5:$addr)]>; -defm LDCL : LdStCop <1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>; -defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; -defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; +defm LDC : LdStCop <1, 0, "ldc", [(int_arm_ldc timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm LDCL : LdStCop <1, 1, "ldcl", [(int_arm_ldcl timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; +defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; -defm STC : LdStCop <0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>; -defm STCL : LdStCop <0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>; -defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; -defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; +defm STC : LdStCop <0, 0, "stc", [(int_arm_stc timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm STCL : LdStCop <0, 1, "stcl", [(int_arm_stcl timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; +defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; } // DecoderNamespace = "CoProc" @@ -5333,8 +5377,8 @@ def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */, (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), - [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, - imm:$CRm, imm:$opc2)]>, + [(int_arm_mcr timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn, + timm:$CRm, timm:$opc2)]>, ComplexDeprecationPredicate<"MCR">; def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm", (MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, @@ -5347,8 +5391,8 @@ def : ARMInstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm", (MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, 0, pred:$p)>; -def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2), - (MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; +def : ARMPat<(int_arm_mrc timm:$cop, timm:$opc1, timm:$CRn, timm:$CRm, timm:$opc2), + (MRC p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>; class MovRCopro2<string opc, bit direction, dag oops, dag iops, list<dag> pattern> @@ -5379,8 +5423,8 @@ def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */, (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), - [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, - imm:$CRm, imm:$opc2)]>, + [(int_arm_mcr2 timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn, + timm:$CRm, timm:$opc2)]>, Requires<[IsARM,PreV8]>; def : ARMInstAlias<"mcr2 $cop, $opc1, $Rt, $CRn, $CRm", (MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, @@ -5394,9 +5438,9 @@ def : ARMInstAlias<"mrc2 $cop, $opc1, $Rt, $CRn, $CRm", (MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, 0)>; -def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, - imm:$CRm, imm:$opc2), - (MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; +def : ARMV5TPat<(int_arm_mrc2 timm:$cop, timm:$opc1, timm:$CRn, + 
timm:$CRm, timm:$opc2), + (MRC2 p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>; class MovRRCopro<string opc, bit direction, dag oops, dag iops, list<dag> pattern = []> @@ -5422,8 +5466,8 @@ class MovRRCopro<string opc, bit direction, dag oops, dag iops, list<dag> def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */, (outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm), - [(int_arm_mcrr imm:$cop, imm:$opc1, GPRnopc:$Rt, - GPRnopc:$Rt2, imm:$CRm)]>; + [(int_arm_mcrr timm:$cop, timm:$opc1, GPRnopc:$Rt, + GPRnopc:$Rt2, timm:$CRm)]>; def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */, (outs GPRnopc:$Rt, GPRnopc:$Rt2), (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm), []>; @@ -5455,8 +5499,8 @@ class MovRRCopro2<string opc, bit direction, dag oops, dag iops, def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */, (outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm), - [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt, - GPRnopc:$Rt2, imm:$CRm)]>; + [(int_arm_mcrr2 timm:$cop, timm:$opc1, GPRnopc:$Rt, + GPRnopc:$Rt2, timm:$CRm)]>; def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */, (outs GPRnopc:$Rt, GPRnopc:$Rt2), @@ -5579,12 +5623,12 @@ def MSRbanked : ABI<0b0001, (outs), (ins banked_reg:$banked, GPRnopc:$Rn), def win__chkstk : SDNode<"ARMISD::WIN__CHKSTK", SDTNone, [SDNPHasChain, SDNPSideEffect]>; -let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in +let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP], hasNoSchedulingInfo = 1 in def WIN__CHKSTK : PseudoInst<(outs), (ins), NoItinerary, [(win__chkstk)]>; def win__dbzchk : SDNode<"ARMISD::WIN__DBZCHK", SDT_WIN__DBZCHK, [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>; -let usesCustomInserter = 1, Defs = [CPSR] in +let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in def WIN__DBZCHK : PseudoInst<(outs), (ins tGPR:$divisor), NoItinerary, [(win__dbzchk tGPR:$divisor)]>; @@ -6131,10 +6175,10 @@ def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm", def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>, ComplexDeprecationPredicate<"IT">; -let mayLoad = 1, mayStore =1, hasSideEffects = 1 in +let mayLoad = 1, mayStore =1, hasSideEffects = 1, hasNoSchedulingInfo = 1 in def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn), NoItinerary, - [(set GPR:$Rd, (int_arm_space imm:$size, GPR:$Rn))]>; + [(set GPR:$Rd, (int_arm_space timm:$size, GPR:$Rn))]>; //===---------------------------------- // Atomic cmpxchg for -O0 @@ -6174,4 +6218,5 @@ def CompilerBarrier : PseudoInst<(outs), (ins i32imm:$ordering), NoItinerary, let hasSideEffects = 1; let Size = 0; let AsmString = "@ COMPILER BARRIER"; + let hasNoSchedulingInfo = 1; } diff --git a/lib/Target/ARM/ARMInstrMVE.td b/lib/Target/ARM/ARMInstrMVE.td index 3e7ae55c7fc8..4f67cd6e47cc 100644 --- a/lib/Target/ARM/ARMInstrMVE.td +++ b/lib/Target/ARM/ARMInstrMVE.td @@ -160,7 +160,8 @@ class TMemImm7ShiftOffsetAsmOperand<int shift> : AsmOperandClass { let RenderMethod = "addMemImmOffsetOperands"; } -class taddrmode_imm7<int shift> : MemOperand { +class taddrmode_imm7<int shift> : MemOperand, + ComplexPattern<i32, 2, "SelectTAddrModeImm7<"#shift#">", []> { let ParserMatchClass = TMemImm7ShiftOffsetAsmOperand<shift>; // They are printed the same way as the T2 imm8 version let PrintMethod = "printT2AddrModeImm8Operand<false>"; @@ -221,7 +222,9 @@ def t2am_imm7shift0OffsetAsmOperand : 
t2am_imm7shiftOffsetAsmOperand<0>; def t2am_imm7shift1OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<1>; def t2am_imm7shift2OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<2>; -class t2am_imm7_offset<int shift> : MemOperand { +class t2am_imm7_offset<int shift> : MemOperand, + ComplexPattern<i32, 1, "SelectT2AddrModeImm7Offset<"#shift#">", + [], [SDNPWantRoot]> { // They are printed the same way as the imm8 version let PrintMethod = "printT2AddrModeImm8OffsetOperand"; let ParserMatchClass = @@ -371,6 +374,8 @@ class MVE_ScalarShiftSRegReg<string iname, bits<2> op5_4, list<dag> pattern=[]> let Inst{7-6} = 0b00; let Inst{5-4} = op5_4{1-0}; let Inst{3-0} = 0b1101; + + let Unpredictable{8-6} = 0b111; } def MVE_SQRSHR : MVE_ScalarShiftSRegReg<"sqrshr", 0b10>; @@ -403,18 +408,17 @@ class MVE_ScalarShiftDRegImm<string iname, bits<2> op5_4, bit op16, let Inst{3-0} = 0b1111; } -class MVE_ScalarShiftDRegReg<string iname, bit op5, bit op16, - list<dag> pattern=[]> +class MVE_ScalarShiftDRegRegBase<string iname, dag iops, string asm, + bit op5, bit op16, list<dag> pattern=[]> : MVE_ScalarShiftDoubleReg< - iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm), - "$RdaLo, $RdaHi, $Rm", "@earlyclobber $RdaHi,@earlyclobber $RdaLo," - "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src", + iname, iops, asm, "@earlyclobber $RdaHi,@earlyclobber $RdaLo," + "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src", pattern> { bits<4> Rm; let Inst{16} = op16; let Inst{15-12} = Rm{3-0}; - let Inst{7-6} = 0b00; + let Inst{6} = 0b0; let Inst{5} = op5; let Inst{4} = 0b0; let Inst{3-0} = 0b1101; @@ -427,27 +431,44 @@ class MVE_ScalarShiftDRegReg<string iname, bit op5, bit op16, let DecoderMethod = "DecodeMVEOverlappingLongShift"; } -def MVE_ASRLr : MVE_ScalarShiftDRegReg<"asrl", 0b1, 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, +class MVE_ScalarShiftDRegReg<string iname, bit op5, list<dag> pattern=[]> + : MVE_ScalarShiftDRegRegBase< + iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm), + "$RdaLo, $RdaHi, $Rm", op5, 0b0, pattern> { + + let Inst{7} = 0b0; +} + +class MVE_ScalarShiftDRegRegWithSat<string iname, bit op5, list<dag> pattern=[]> + : MVE_ScalarShiftDRegRegBase< + iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm, saturateop:$sat), + "$RdaLo, $RdaHi, $sat, $Rm", op5, 0b1, pattern> { + bit sat; + + let Inst{7} = sat; +} + +def MVE_ASRLr : MVE_ScalarShiftDRegReg<"asrl", 0b1, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMasrl tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm))]>; def MVE_ASRLi : MVE_ScalarShiftDRegImm<"asrl", 0b10, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMasrl tGPREven:$RdaLo_src, - tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>; -def MVE_LSLLr : MVE_ScalarShiftDRegReg<"lsll", 0b0, 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, + tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>; +def MVE_LSLLr : MVE_ScalarShiftDRegReg<"lsll", 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMlsll tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm))]>; def MVE_LSLLi : MVE_ScalarShiftDRegImm<"lsll", 0b00, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMlsll tGPREven:$RdaLo_src, - tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>; + tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>; def MVE_LSRL : MVE_ScalarShiftDRegImm<"lsrl", 0b01, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMlsrl tGPREven:$RdaLo_src, - tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>; + tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>; -def MVE_SQRSHRL : MVE_ScalarShiftDRegReg<"sqrshrl", 0b1, 0b1>; +def MVE_SQRSHRL : MVE_ScalarShiftDRegRegWithSat<"sqrshrl", 0b1>; def MVE_SQSHLL 
: MVE_ScalarShiftDRegImm<"sqshll", 0b11, 0b1>; def MVE_SRSHRL : MVE_ScalarShiftDRegImm<"srshrl", 0b10, 0b1>; -def MVE_UQRSHLL : MVE_ScalarShiftDRegReg<"uqrshll", 0b0, 0b1>; +def MVE_UQRSHLL : MVE_ScalarShiftDRegRegWithSat<"uqrshll", 0b0>; def MVE_UQSHLL : MVE_ScalarShiftDRegImm<"uqshll", 0b00, 0b1>; def MVE_URSHRL : MVE_ScalarShiftDRegImm<"urshrl", 0b01, 0b1>; @@ -531,6 +552,19 @@ defm MVE_VADDVu8 : MVE_VADDV_A<"u8", 0b1, 0b00>; defm MVE_VADDVu16 : MVE_VADDV_A<"u16", 0b1, 0b01>; defm MVE_VADDVu32 : MVE_VADDV_A<"u32", 0b1, 0b10>; +let Predicates = [HasMVEInt] in { + def : Pat<(i32 (vecreduce_add (v4i32 MQPR:$src))), (i32 (MVE_VADDVu32no_acc $src))>; + def : Pat<(i32 (vecreduce_add (v8i16 MQPR:$src))), (i32 (MVE_VADDVu16no_acc $src))>; + def : Pat<(i32 (vecreduce_add (v16i8 MQPR:$src))), (i32 (MVE_VADDVu8no_acc $src))>; + def : Pat<(i32 (add (i32 (vecreduce_add (v4i32 MQPR:$src1))), (i32 tGPR:$src2))), + (i32 (MVE_VADDVu32acc $src2, $src1))>; + def : Pat<(i32 (add (i32 (vecreduce_add (v8i16 MQPR:$src1))), (i32 tGPR:$src2))), + (i32 (MVE_VADDVu16acc $src2, $src1))>; + def : Pat<(i32 (add (i32 (vecreduce_add (v16i8 MQPR:$src1))), (i32 tGPR:$src2))), + (i32 (MVE_VADDVu8acc $src2, $src1))>; + +} + class MVE_VADDLV<string iname, string suffix, dag iops, string cstr, bit A, bit U, list<dag> pattern=[]> : MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname, @@ -636,6 +670,35 @@ multiclass MVE_VMINMAXV_ty<string iname, bit bit_7, list<dag> pattern=[]> { defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 0b1>; defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0b0>; +let Predicates = [HasMVEInt] in { + def : Pat<(i32 (vecreduce_smax (v16i8 MQPR:$src))), + (i32 (MVE_VMAXVs8 (t2MVNi (i32 127)), $src))>; + def : Pat<(i32 (vecreduce_smax (v8i16 MQPR:$src))), + (i32 (MVE_VMAXVs16 (t2MOVi32imm (i32 -32768)), $src))>; + def : Pat<(i32 (vecreduce_smax (v4i32 MQPR:$src))), + (i32 (MVE_VMAXVs32 (t2MOVi (i32 -2147483648)), $src))>; + def : Pat<(i32 (vecreduce_umax (v16i8 MQPR:$src))), + (i32 (MVE_VMAXVu8 (t2MOVi (i32 0)), $src))>; + def : Pat<(i32 (vecreduce_umax (v8i16 MQPR:$src))), + (i32 (MVE_VMAXVu16 (t2MOVi (i32 0)), $src))>; + def : Pat<(i32 (vecreduce_umax (v4i32 MQPR:$src))), + (i32 (MVE_VMAXVu32 (t2MOVi (i32 0)), $src))>; + + def : Pat<(i32 (vecreduce_smin (v16i8 MQPR:$src))), + (i32 (MVE_VMINVs8 (t2MOVi (i32 127)), $src))>; + def : Pat<(i32 (vecreduce_smin (v8i16 MQPR:$src))), + (i32 (MVE_VMINVs16 (t2MOVi16 (i32 32767)), $src))>; + def : Pat<(i32 (vecreduce_smin (v4i32 MQPR:$src))), + (i32 (MVE_VMINVs32 (t2MVNi (i32 -2147483648)), $src))>; + def : Pat<(i32 (vecreduce_umin (v16i8 MQPR:$src))), + (i32 (MVE_VMINVu8 (t2MOVi (i32 255)), $src))>; + def : Pat<(i32 (vecreduce_umin (v8i16 MQPR:$src))), + (i32 (MVE_VMINVu16 (t2MOVi16 (i32 65535)), $src))>; + def : Pat<(i32 (vecreduce_umin (v4i32 MQPR:$src))), + (i32 (MVE_VMINVu32 (t2MOVi (i32 4294967295)), $src))>; + +} + multiclass MVE_VMINMAXAV_ty<string iname, bit bit_7, list<dag> pattern=[]> { def s8 : MVE_VMINMAXV<iname, "s8", 0b0, 0b00, 0b0, bit_7>; def s16 : MVE_VMINMAXV<iname, "s16", 0b0, 0b01, 0b0, bit_7>; @@ -667,57 +730,57 @@ class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr, let Inst{0} = bit_0; } -multiclass MVE_VMLAMLSDAV_X<string iname, string suffix, dag iops, string cstr, - bit sz, bit bit_28, bit A, bit bit_8, bit bit_0, - list<dag> pattern=[]> { - def _noexch : MVE_VMLAMLSDAV<iname, suffix, iops, cstr, sz, - bit_28, A, 0b0, bit_8, bit_0, pattern>; - def _exch : MVE_VMLAMLSDAV<iname # "x", suffix, iops, cstr, sz, - 
bit_28, A, 0b1, bit_8, bit_0, pattern>; +multiclass MVE_VMLAMLSDAV_A<string iname, string x, string suffix, + bit sz, bit bit_28, bit X, bit bit_8, bit bit_0, + list<dag> pattern=[]> { + def ""#x#suffix : MVE_VMLAMLSDAV<iname # x, suffix, + (ins MQPR:$Qn, MQPR:$Qm), "", + sz, bit_28, 0b0, X, bit_8, bit_0, pattern>; + def "a"#x#suffix : MVE_VMLAMLSDAV<iname # "a" # x, suffix, + (ins tGPREven:$RdaSrc, MQPR:$Qn, MQPR:$Qm), + "$RdaDest = $RdaSrc", + sz, bit_28, 0b1, X, bit_8, bit_0, pattern>; } -multiclass MVE_VMLAMLSDAV_XA<string iname, string suffix, bit sz, bit bit_28, - bit bit_8, bit bit_0, list<dag> pattern=[]> { - defm _noacc : MVE_VMLAMLSDAV_X<iname, suffix, (ins MQPR:$Qn, MQPR:$Qm), "", - sz, bit_28, 0b0, bit_8, bit_0, pattern>; - defm _acc : MVE_VMLAMLSDAV_X<iname # "a", suffix, - (ins tGPREven:$RdaSrc, MQPR:$Qn, MQPR:$Qm), - "$RdaDest = $RdaSrc", - sz, bit_28, 0b1, bit_8, bit_0, pattern>; +multiclass MVE_VMLAMLSDAV_AX<string iname, string suffix, bit sz, bit bit_28, + bit bit_8, bit bit_0, list<dag> pattern=[]> { + defm "" : MVE_VMLAMLSDAV_A<iname, "", suffix, sz, bit_28, + 0b0, bit_8, bit_0, pattern>; + defm "" : MVE_VMLAMLSDAV_A<iname, "x", suffix, sz, bit_28, + 0b1, bit_8, bit_0, pattern>; } -multiclass MVE_VMLADAV_multi<string suffix, bit sz, bit U, bit bit_8, - list<dag> pattern=[]> { - defm "" : MVE_VMLAMLSDAV_XA<"vmladav", suffix, sz, U, bit_8, 0b0, pattern>; +multiclass MVE_VMLADAV_multi<string suffix, bit sz, bit bit_8, + list<dag> pattern=[]> { + defm "" : MVE_VMLAMLSDAV_AX<"vmladav", "s"#suffix, + sz, 0b0, bit_8, 0b0, pattern>; + defm "" : MVE_VMLAMLSDAV_A<"vmladav", "", "u"#suffix, + sz, 0b1, 0b0, bit_8, 0b0, pattern>; } -defm MVE_VMLADAVs16 : MVE_VMLADAV_multi<"s16", 0b0, 0b0, 0b0>; -defm MVE_VMLADAVs32 : MVE_VMLADAV_multi<"s32", 0b1, 0b0, 0b0>; -defm MVE_VMLADAVu16 : MVE_VMLADAV_multi<"u16", 0b0, 0b1, 0b0>; -defm MVE_VMLADAVu32 : MVE_VMLADAV_multi<"u32", 0b1, 0b1, 0b0>; +multiclass MVE_VMLSDAV_multi<string suffix, bit sz, bit bit_28, + list<dag> pattern=[]> { + defm "" : MVE_VMLAMLSDAV_AX<"vmlsdav", "s"#suffix, + sz, bit_28, 0b0, 0b1, pattern>; +} + +defm MVE_VMLADAV : MVE_VMLADAV_multi< "8", 0b0, 0b1>; +defm MVE_VMLADAV : MVE_VMLADAV_multi<"16", 0b0, 0b0>; +defm MVE_VMLADAV : MVE_VMLADAV_multi<"32", 0b1, 0b0>; -defm MVE_VMLADAVs8 : MVE_VMLADAV_multi<"s8", 0b0, 0b0, 0b1>; -defm MVE_VMLADAVu8 : MVE_VMLADAV_multi<"u8", 0b0, 0b1, 0b1>; +defm MVE_VMLSDAV : MVE_VMLSDAV_multi< "8", 0b0, 0b1>; +defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"16", 0b0, 0b0>; +defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"32", 0b1, 0b0>; // vmlav aliases vmladav -foreach acc = ["_acc", "_noacc"] in { +foreach acc = ["", "a"] in { foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32"] in { - def : MVEInstAlias<!strconcat("vmlav", !if(!eq(acc, "_acc"), "a", ""), - "${vp}.", suffix, "\t$RdaDest, $Qn, $Qm"), - (!cast<Instruction>("MVE_VMLADAV"#suffix#acc#"_noexch") + def : MVEInstAlias<"vmlav"#acc#"${vp}."#suffix#"\t$RdaDest, $Qn, $Qm", + (!cast<Instruction>("MVE_VMLADAV"#acc#suffix) tGPREven:$RdaDest, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; } } -multiclass MVE_VMLSDAV_multi<string suffix, bit sz, bit bit_28, - list<dag> pattern=[]> { - defm "" : MVE_VMLAMLSDAV_XA<"vmlsdav", suffix, sz, bit_28, 0b0, 0b1, pattern>; -} - -defm MVE_VMLSDAVs8 : MVE_VMLSDAV_multi<"s8", 0, 0b1>; -defm MVE_VMLSDAVs16 : MVE_VMLSDAV_multi<"s16", 0, 0b0>; -defm MVE_VMLSDAVs32 : MVE_VMLSDAV_multi<"s32", 1, 0b0>; - // Base class for VMLALDAV and VMLSLDAV, VRMLALDAVH, VRMLSLDAVH class MVE_VMLALDAVBase<string iname, string suffix, dag iops, string 
cstr, bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0, @@ -742,82 +805,83 @@ class MVE_VMLALDAVBase<string iname, string suffix, dag iops, string cstr, let Inst{0} = bit_0; } -multiclass MVE_VMLALDAVBase_X<string iname, string suffix, dag iops, - string cstr, bit sz, bit bit_28, bit A, - bit bit_8, bit bit_0, list<dag> pattern=[]> { - def _noexch : MVE_VMLALDAVBase<iname, suffix, iops, cstr, sz, - bit_28, A, 0b0, bit_8, bit_0, pattern>; - def _exch : MVE_VMLALDAVBase<iname # "x", suffix, iops, cstr, sz, - bit_28, A, 0b1, bit_8, bit_0, pattern>; +multiclass MVE_VMLALDAVBase_A<string iname, string x, string suffix, + bit sz, bit bit_28, bit X, bit bit_8, bit bit_0, + list<dag> pattern=[]> { + def ""#x#suffix : MVE_VMLALDAVBase< + iname # x, suffix, (ins MQPR:$Qn, MQPR:$Qm), "", + sz, bit_28, 0b0, X, bit_8, bit_0, pattern>; + def "a"#x#suffix : MVE_VMLALDAVBase< + iname # "a" # x, suffix, + (ins tGPREven:$RdaLoSrc, tGPROdd:$RdaHiSrc, MQPR:$Qn, MQPR:$Qm), + "$RdaLoDest = $RdaLoSrc,$RdaHiDest = $RdaHiSrc", + sz, bit_28, 0b1, X, bit_8, bit_0, pattern>; } -multiclass MVE_VMLALDAVBase_XA<string iname, string suffix, bit sz, bit bit_28, - bit bit_8, bit bit_0, list<dag> pattern=[]> { - defm _noacc : MVE_VMLALDAVBase_X< - iname, suffix, (ins MQPR:$Qn, MQPR:$Qm), "", - sz, bit_28, 0b0, bit_8, bit_0, pattern>; - defm _acc : MVE_VMLALDAVBase_X< - iname # "a", suffix, (ins tGPREven:$RdaLoSrc, tGPROdd:$RdaHiSrc, - MQPR:$Qn, MQPR:$Qm), - "$RdaLoDest = $RdaLoSrc,$RdaHiDest = $RdaHiSrc", - sz, bit_28, 0b1, bit_8, bit_0, pattern>; + +multiclass MVE_VMLALDAVBase_AX<string iname, string suffix, bit sz, bit bit_28, + bit bit_8, bit bit_0, list<dag> pattern=[]> { + defm "" : MVE_VMLALDAVBase_A<iname, "", suffix, sz, + bit_28, 0b0, bit_8, bit_0, pattern>; + defm "" : MVE_VMLALDAVBase_A<iname, "x", suffix, sz, + bit_28, 0b1, bit_8, bit_0, pattern>; } -multiclass MVE_VRMLALDAVH_multi<string suffix, bit U, list<dag> pattern=[]> { - defm "" : MVE_VMLALDAVBase_XA< - "vrmlaldavh", suffix, 0b0, U, 0b1, 0b0, pattern>; +multiclass MVE_VRMLALDAVH_multi<string suffix, list<dag> pattern=[]> { + defm "" : MVE_VMLALDAVBase_AX<"vrmlaldavh", "s"#suffix, + 0b0, 0b0, 0b1, 0b0, pattern>; + defm "" : MVE_VMLALDAVBase_A<"vrmlaldavh", "", "u"#suffix, + 0b0, 0b1, 0b0, 0b1, 0b0, pattern>; } -defm MVE_VRMLALDAVHs32 : MVE_VRMLALDAVH_multi<"s32", 0>; -defm MVE_VRMLALDAVHu32 : MVE_VRMLALDAVH_multi<"u32", 1>; +defm MVE_VRMLALDAVH : MVE_VRMLALDAVH_multi<"32">; // vrmlalvh aliases for vrmlaldavh def : MVEInstAlias<"vrmlalvh${vp}.s32\t$RdaLo, $RdaHi, $Qn, $Qm", - (MVE_VRMLALDAVHs32_noacc_noexch + (MVE_VRMLALDAVHs32 tGPREven:$RdaLo, tGPROdd:$RdaHi, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; def : MVEInstAlias<"vrmlalvha${vp}.s32\t$RdaLo, $RdaHi, $Qn, $Qm", - (MVE_VRMLALDAVHs32_acc_noexch + (MVE_VRMLALDAVHas32 tGPREven:$RdaLo, tGPROdd:$RdaHi, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; def : MVEInstAlias<"vrmlalvh${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm", - (MVE_VRMLALDAVHu32_noacc_noexch + (MVE_VRMLALDAVHu32 tGPREven:$RdaLo, tGPROdd:$RdaHi, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; def : MVEInstAlias<"vrmlalvha${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm", - (MVE_VRMLALDAVHu32_acc_noexch + (MVE_VRMLALDAVHau32 tGPREven:$RdaLo, tGPROdd:$RdaHi, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; -multiclass MVE_VMLALDAV_multi<string suffix, bit sz, bit U, - list<dag> pattern=[]> { - defm "" : MVE_VMLALDAVBase_XA<"vmlaldav", suffix, sz, U, 0b0, 0b0, pattern>; +multiclass MVE_VMLALDAV_multi<string suffix, bit sz, list<dag> pattern=[]> { + defm "" : MVE_VMLALDAVBase_AX<"vmlaldav", "s"#suffix, sz, 
0b0, 0b0, 0b0, pattern>; + defm "" : MVE_VMLALDAVBase_A<"vmlaldav", "", "u"#suffix, + sz, 0b1, 0b0, 0b0, 0b0, pattern>; } -defm MVE_VMLALDAVs16 : MVE_VMLALDAV_multi<"s16", 0b0, 0b0>; -defm MVE_VMLALDAVs32 : MVE_VMLALDAV_multi<"s32", 0b1, 0b0>; -defm MVE_VMLALDAVu16 : MVE_VMLALDAV_multi<"u16", 0b0, 0b1>; -defm MVE_VMLALDAVu32 : MVE_VMLALDAV_multi<"u32", 0b1, 0b1>; +defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"16", 0b0>; +defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"32", 0b1>; // vmlalv aliases vmlaldav -foreach acc = ["_acc", "_noacc"] in { +foreach acc = ["", "a"] in { foreach suffix = ["s16", "s32", "u16", "u32"] in { - def : MVEInstAlias<!strconcat("vmlalv", !if(!eq(acc, "_acc"), "a", ""), - "${vp}.", suffix, "\t$RdaLoDest, $RdaHiDest, $Qn, $Qm"), - (!cast<Instruction>("MVE_VMLALDAV"#suffix#acc#"_noexch") + def : MVEInstAlias<"vmlalv" # acc # "${vp}." # suffix # + "\t$RdaLoDest, $RdaHiDest, $Qn, $Qm", + (!cast<Instruction>("MVE_VMLALDAV"#acc#suffix) tGPREven:$RdaLoDest, tGPROdd:$RdaHiDest, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; } } multiclass MVE_VMLSLDAV_multi<string iname, string suffix, bit sz, - bit bit_28, list<dag> pattern=[]> { - defm "" : MVE_VMLALDAVBase_XA<iname, suffix, sz, bit_28, 0b0, 0b1, pattern>; + bit bit_28, list<dag> pattern=[]> { + defm "" : MVE_VMLALDAVBase_AX<iname, suffix, sz, bit_28, 0b0, 0b1, pattern>; } -defm MVE_VMLSLDAVs16 : MVE_VMLSLDAV_multi<"vmlsldav", "s16", 0b0, 0b0>; -defm MVE_VMLSLDAVs32 : MVE_VMLSLDAV_multi<"vmlsldav", "s32", 0b1, 0b0>; -defm MVE_VRMLSLDAVHs32 : MVE_VMLSLDAV_multi<"vrmlsldavh", "s32", 0b0, 0b1>; +defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s16", 0b0, 0b0>; +defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s32", 0b1, 0b0>; +defm MVE_VRMLSLDAVH : MVE_VMLSLDAV_multi<"vrmlsldavh", "s32", 0b0, 0b1>; // end of mve_rDest instructions @@ -967,11 +1031,12 @@ def MVE_VBIC : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), let Inst{6} = 0b1; let Inst{4} = 0b1; let Inst{0} = 0b0; + let validForTailPredication = 1; } -class MVE_VREV<string iname, string suffix, bits<2> size, bits<2> bit_8_7> +class MVE_VREV<string iname, string suffix, bits<2> size, bits<2> bit_8_7, string cstr=""> : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), iname, - suffix, "$Qd, $Qm", ""> { + suffix, "$Qd, $Qm", cstr> { let Inst{28} = 0b1; let Inst{25-23} = 0b111; @@ -985,9 +1050,9 @@ class MVE_VREV<string iname, string suffix, bits<2> size, bits<2> bit_8_7> let Inst{0} = 0b0; } -def MVE_VREV64_8 : MVE_VREV<"vrev64", "8", 0b00, 0b00>; -def MVE_VREV64_16 : MVE_VREV<"vrev64", "16", 0b01, 0b00>; -def MVE_VREV64_32 : MVE_VREV<"vrev64", "32", 0b10, 0b00>; +def MVE_VREV64_8 : MVE_VREV<"vrev64", "8", 0b00, 0b00, "@earlyclobber $Qd">; +def MVE_VREV64_16 : MVE_VREV<"vrev64", "16", 0b01, 0b00, "@earlyclobber $Qd">; +def MVE_VREV64_32 : MVE_VREV<"vrev64", "32", 0b10, 0b00, "@earlyclobber $Qd">; def MVE_VREV32_8 : MVE_VREV<"vrev32", "8", 0b00, 0b01>; def MVE_VREV32_16 : MVE_VREV<"vrev32", "16", 0b01, 0b01>; @@ -995,6 +1060,13 @@ def MVE_VREV32_16 : MVE_VREV<"vrev32", "16", 0b01, 0b01>; def MVE_VREV16_8 : MVE_VREV<"vrev16", "8", 0b00, 0b10>; let Predicates = [HasMVEInt] in { + def : Pat<(v8i16 (bswap (v8i16 MQPR:$src))), + (v8i16 (MVE_VREV16_8 (v8i16 MQPR:$src)))>; + def : Pat<(v4i32 (bswap (v4i32 MQPR:$src))), + (v4i32 (MVE_VREV32_8 (v4i32 MQPR:$src)))>; +} + +let Predicates = [HasMVEInt] in { def : Pat<(v4i32 (ARMvrev64 (v4i32 MQPR:$src))), (v4i32 (MVE_VREV64_32 (v4i32 MQPR:$src)))>; def : Pat<(v8i16 (ARMvrev64 (v8i16 MQPR:$src))), @@ -1026,6 +1098,7 @@ def MVE_VMVN : 
MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), let Inst{12-6} = 0b0010111; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } let Predicates = [HasMVEInt] in { @@ -1054,6 +1127,7 @@ class MVE_bit_ops<string iname, bits<2> bit_21_20, bit bit_28> let Inst{6} = 0b1; let Inst{4} = 0b1; let Inst{0} = 0b0; + let validForTailPredication = 1; } def MVE_VEOR : MVE_bit_ops<"veor", 0b00, 0b1>; @@ -1145,6 +1219,7 @@ class MVE_bit_cmode<string iname, string suffix, bits<4> cmode, dag inOps> class MVE_VORR<string suffix, bits<4> cmode, ExpandImm imm_type> : MVE_bit_cmode<"vorr", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> { let Inst{5} = 0b0; + let validForTailPredication = 1; } def MVE_VORRIZ0v4i32 : MVE_VORR<"i32", 0b0001, expzero00>; @@ -1173,6 +1248,7 @@ def MVE_VMOV : MVEInstAlias<"vmov${vp}\t$Qd, $Qm", class MVE_VBIC<string suffix, bits<4> cmode, ExpandImm imm_type> : MVE_bit_cmode<"vbic", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> { let Inst{5} = 0b1; + let validForTailPredication = 1; } def MVE_VBICIZ0v4i32 : MVE_VBIC<"i32", 0b0001, expzero00>; @@ -1315,8 +1391,12 @@ let Predicates = [HasMVEInt] in { def : Pat<(insertelt (v8f16 MQPR:$src1), HPR:$src2, imm:$lane), (MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS HPR:$src2, rGPR), imm:$lane)>; - def : Pat<(extractelt (v8f16 MQPR:$src), imm:$lane), - (COPY_TO_REGCLASS (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane), HPR)>; + def : Pat<(extractelt (v8f16 MQPR:$src), imm_even:$lane), + (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_even:$lane))>; + def : Pat<(extractelt (v8f16 MQPR:$src), imm_odd:$lane), + (COPY_TO_REGCLASS + (VMOVH (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane))), + HPR)>; def : Pat<(v4f32 (scalar_to_vector SPR:$src)), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>; @@ -1408,6 +1488,7 @@ class MVE_VADDSUB<string iname, string suffix, bits<2> size, bit subtract, let Inst{12-8} = 0b01000; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } class MVE_VADD<string suffix, bits<2> size, list<dag> pattern=[]> @@ -1442,8 +1523,8 @@ let Predicates = [HasMVEInt] in { } class MVE_VQADDSUB<string iname, string suffix, bit U, bit subtract, - bits<2> size, list<dag> pattern=[]> - : MVE_int<iname, suffix, size, pattern> { + bits<2> size, ValueType vt> + : MVE_int<iname, suffix, size, []> { let Inst{28} = U; let Inst{25-23} = 0b110; @@ -1453,26 +1534,49 @@ class MVE_VQADDSUB<string iname, string suffix, bit U, bit subtract, let Inst{8} = 0b0; let Inst{4} = 0b1; let Inst{0} = 0b0; + let validForTailPredication = 1; + + ValueType VT = vt; } -class MVE_VQADD<string suffix, bit U, bits<2> size, list<dag> pattern=[]> - : MVE_VQADDSUB<"vqadd", suffix, U, 0b0, size, pattern>; -class MVE_VQSUB<string suffix, bit U, bits<2> size, list<dag> pattern=[]> - : MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size, pattern>; +class MVE_VQADD<string suffix, bit U, bits<2> size, ValueType VT> + : MVE_VQADDSUB<"vqadd", suffix, U, 0b0, size, VT>; +class MVE_VQSUB<string suffix, bit U, bits<2> size, ValueType VT> + : MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size, VT>; + +def MVE_VQADDs8 : MVE_VQADD<"s8", 0b0, 0b00, v16i8>; +def MVE_VQADDs16 : MVE_VQADD<"s16", 0b0, 0b01, v8i16>; +def MVE_VQADDs32 : MVE_VQADD<"s32", 0b0, 0b10, v4i32>; +def MVE_VQADDu8 : MVE_VQADD<"u8", 0b1, 0b00, v16i8>; +def MVE_VQADDu16 : MVE_VQADD<"u16", 0b1, 0b01, v8i16>; +def MVE_VQADDu32 : MVE_VQADD<"u32", 0b1, 0b10, v4i32>; + +def MVE_VQSUBs8 : MVE_VQSUB<"s8", 0b0, 0b00, v16i8>; +def MVE_VQSUBs16 : MVE_VQSUB<"s16", 0b0, 0b01, 
v8i16>; +def MVE_VQSUBs32 : MVE_VQSUB<"s32", 0b0, 0b10, v4i32>; +def MVE_VQSUBu8 : MVE_VQSUB<"u8", 0b1, 0b00, v16i8>; +def MVE_VQSUBu16 : MVE_VQSUB<"u16", 0b1, 0b01, v8i16>; +def MVE_VQSUBu32 : MVE_VQSUB<"u32", 0b1, 0b10, v4i32>; -def MVE_VQADDs8 : MVE_VQADD<"s8", 0b0, 0b00>; -def MVE_VQADDs16 : MVE_VQADD<"s16", 0b0, 0b01>; -def MVE_VQADDs32 : MVE_VQADD<"s32", 0b0, 0b10>; -def MVE_VQADDu8 : MVE_VQADD<"u8", 0b1, 0b00>; -def MVE_VQADDu16 : MVE_VQADD<"u16", 0b1, 0b01>; -def MVE_VQADDu32 : MVE_VQADD<"u32", 0b1, 0b10>; +let Predicates = [HasMVEInt] in { + foreach instr = [MVE_VQADDu8, MVE_VQADDu16, MVE_VQADDu32] in + foreach VT = [instr.VT] in + def : Pat<(VT (uaddsat (VT MQPR:$Qm), (VT MQPR:$Qn))), + (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; + foreach instr = [MVE_VQADDs8, MVE_VQADDs16, MVE_VQADDs32] in + foreach VT = [instr.VT] in + def : Pat<(VT (saddsat (VT MQPR:$Qm), (VT MQPR:$Qn))), + (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; + foreach instr = [MVE_VQSUBu8, MVE_VQSUBu16, MVE_VQSUBu32] in + foreach VT = [instr.VT] in + def : Pat<(VT (usubsat (VT MQPR:$Qm), (VT MQPR:$Qn))), + (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; + foreach instr = [MVE_VQSUBs8, MVE_VQSUBs16, MVE_VQSUBs32] in + foreach VT = [instr.VT] in + def : Pat<(VT (ssubsat (VT MQPR:$Qm), (VT MQPR:$Qn))), + (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; +} -def MVE_VQSUBs8 : MVE_VQSUB<"s8", 0b0, 0b00>; -def MVE_VQSUBs16 : MVE_VQSUB<"s16", 0b0, 0b01>; -def MVE_VQSUBs32 : MVE_VQSUB<"s32", 0b0, 0b10>; -def MVE_VQSUBu8 : MVE_VQSUB<"u8", 0b1, 0b00>; -def MVE_VQSUBu16 : MVE_VQSUB<"u16", 0b1, 0b01>; -def MVE_VQSUBu32 : MVE_VQSUB<"u32", 0b1, 0b10>; class MVE_VABD_int<string suffix, bit U, bits<2> size, list<dag> pattern=[]> : MVE_int<"vabd", suffix, size, pattern> { @@ -1483,6 +1587,7 @@ class MVE_VABD_int<string suffix, bit U, bits<2> size, list<dag> pattern=[]> let Inst{12-8} = 0b00111; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } def MVE_VABDs8 : MVE_VABD_int<"s8", 0b0, 0b00>; @@ -1501,6 +1606,7 @@ class MVE_VRHADD<string suffix, bit U, bits<2> size, list<dag> pattern=[]> let Inst{12-8} = 0b00001; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } def MVE_VRHADDs8 : MVE_VRHADD<"s8", 0b0, 0b00>; @@ -1522,6 +1628,7 @@ class MVE_VHADDSUB<string iname, string suffix, bit U, bit subtract, let Inst{8} = 0b0; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } class MVE_VHADD<string suffix, bit U, bits<2> size, @@ -1545,6 +1652,60 @@ def MVE_VHSUBu8 : MVE_VHSUB<"u8", 0b1, 0b00>; def MVE_VHSUBu16 : MVE_VHSUB<"u16", 0b1, 0b01>; def MVE_VHSUBu32 : MVE_VHSUB<"u32", 0b1, 0b10>; +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (ARMvshrsImm + (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), + (v16i8 (MVE_VHADDs8 + (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; + def : Pat<(v8i16 (ARMvshrsImm + (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), + (v8i16 (MVE_VHADDs16 + (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; + def : Pat<(v4i32 (ARMvshrsImm + (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), + (v4i32 (MVE_VHADDs32 + (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; + + def : Pat<(v16i8 (ARMvshruImm + (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), + (v16i8 (MVE_VHADDu8 + (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; + def : Pat<(v8i16 (ARMvshruImm + (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), + (v8i16 (MVE_VHADDu16 + (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; + def : Pat<(v4i32 (ARMvshruImm + (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), + (v4i32 
(MVE_VHADDu32 + (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; + + def : Pat<(v16i8 (ARMvshrsImm + (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), + (v16i8 (MVE_VHSUBs8 + (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; + def : Pat<(v8i16 (ARMvshrsImm + (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), + (v8i16 (MVE_VHSUBs16 + (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; + def : Pat<(v4i32 (ARMvshrsImm + (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), + (v4i32 (MVE_VHSUBs32 + (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; + + def : Pat<(v16i8 (ARMvshruImm + (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), + (v16i8 (MVE_VHSUBu8 + (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; + def : Pat<(v8i16 (ARMvshruImm + (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), + (v8i16 (MVE_VHSUBu16 + (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; + def : Pat<(v4i32 (ARMvshruImm + (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), + (v4i32 (MVE_VHSUBu32 + (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; +} + class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary, "vdup", suffix, "$Qd, $Rt", vpred_r, "", pattern> { @@ -1563,6 +1724,7 @@ class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]> let Inst{6} = 0b0; let Inst{5} = E; let Inst{4-0} = 0b10000; + let validForTailPredication = 1; } def MVE_VDUP32 : MVE_VDUP<"32", 0b0, 0b0>; @@ -1625,6 +1787,7 @@ class MVE_VCLSCLZ<string iname, string suffix, bits<2> size, let Inst{6} = 0b1; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } def MVE_VCLSs8 : MVE_VCLSCLZ<"vcls", "s8", 0b00, 0b0>; @@ -1635,6 +1798,15 @@ def MVE_VCLZs8 : MVE_VCLSCLZ<"vclz", "i8", 0b00, 0b1>; def MVE_VCLZs16 : MVE_VCLSCLZ<"vclz", "i16", 0b01, 0b1>; def MVE_VCLZs32 : MVE_VCLSCLZ<"vclz", "i32", 0b10, 0b1>; +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 ( ctlz (v16i8 MQPR:$val1))), + (v16i8 ( MVE_VCLZs8 (v16i8 MQPR:$val1)))>; + def : Pat<(v4i32 ( ctlz (v4i32 MQPR:$val1))), + (v4i32 ( MVE_VCLZs32 (v4i32 MQPR:$val1)))>; + def : Pat<(v8i16 ( ctlz (v8i16 MQPR:$val1))), + (v8i16 ( MVE_VCLZs16 (v8i16 MQPR:$val1)))>; +} + class MVE_VABSNEG_int<string iname, string suffix, bits<2> size, bit negate, list<dag> pattern=[]> : MVEIntSingleSrc<iname, suffix, size, pattern> { @@ -1648,6 +1820,7 @@ class MVE_VABSNEG_int<string iname, string suffix, bits<2> size, bit negate, let Inst{6} = 0b1; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } def MVE_VABSs8 : MVE_VABSNEG_int<"vabs", "s8", 0b00, 0b0>; @@ -1689,6 +1862,7 @@ class MVE_VQABSNEG<string iname, string suffix, bits<2> size, let Inst{6} = 0b1; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } def MVE_VQABSs8 : MVE_VQABSNEG<"vqabs", "s8", 0b00, 0b0>; @@ -1720,6 +1894,7 @@ class MVE_mod_imm<string iname, string suffix, bits<4> cmode, bit op, let Inst{3-0} = imm{3-0}; let DecoderMethod = "DecodeMVEModImmInstruction"; + let validForTailPredication = 1; } let isReMaterializable = 1 in { @@ -2115,6 +2290,7 @@ class MVE_shift_by_vec<string iname, string suffix, bit U, let Inst{4} = bit_4; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; + let validForTailPredication = 1; } multiclass mve_shift_by_vec_multi<string iname, bit bit_4, bit bit_8> { @@ -2163,6 +2339,7 @@ class MVE_shift_with_imm<string iname, string suffix, dag oops, dag iops, let Inst{4} = 0b1; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; + let validForTailPredication = 1; } class MVE_VSxI_imm<string iname, string suffix, bit bit_8, dag imm> @@ -2175,6 +2352,7 @@ class 
MVE_VSxI_imm<string iname, string suffix, bit bit_8, dag imm> let Inst{21-16} = imm; let Inst{10-9} = 0b10; let Inst{8} = bit_8; + let validForTailPredication = 1; } def MVE_VSRIimm8 : MVE_VSxI_imm<"vsri", "8", 0b0, (ins shr_imm8:$imm)> { @@ -2427,6 +2605,7 @@ class MVE_VRINT<string rmode, bits<3> op, string suffix, bits<2> size, let Inst{11-10} = 0b01; let Inst{9-7} = op{2-0}; let Inst{4} = 0b0; + let validForTailPredication = 1; } @@ -2489,6 +2668,7 @@ class MVE_VMUL_fp<string suffix, bit size, list<dag> pattern=[]> let Inst{12-8} = 0b01101; let Inst{7} = Qn{3}; let Inst{4} = 0b1; + let validForTailPredication = 1; } def MVE_VMULf32 : MVE_VMUL_fp<"f32", 0b0>; @@ -2556,8 +2736,38 @@ def MVE_VFMSf32 : MVE_VADDSUBFMA_fp<"vfms", "f32", 0b0, 0b1, 0b0, 0b1, def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1, (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; -def MVE_VADDf32 : MVE_VADDSUBFMA_fp<"vadd", "f32", 0b0, 0b0, 0b1, 0b0>; -def MVE_VADDf16 : MVE_VADDSUBFMA_fp<"vadd", "f16", 0b1, 0b0, 0b1, 0b0>; +let Predicates = [HasMVEFloat, UseFusedMAC] in { + def : Pat<(v8f16 (fadd (v8f16 MQPR:$src1), + (fmul (v8f16 MQPR:$src2), + (v8f16 MQPR:$src3)))), + (v8f16 (MVE_VFMAf16 $src1, $src2, $src3))>; + def : Pat<(v4f32 (fadd (v4f32 MQPR:$src1), + (fmul (v4f32 MQPR:$src2), + (v4f32 MQPR:$src3)))), + (v4f32 (MVE_VFMAf32 $src1, $src2, $src3))>; + + def : Pat<(v8f16 (fsub (v8f16 MQPR:$src1), + (fmul (v8f16 MQPR:$src2), + (v8f16 MQPR:$src3)))), + (v8f16 (MVE_VFMSf16 $src1, $src2, $src3))>; + def : Pat<(v4f32 (fsub (v4f32 MQPR:$src1), + (fmul (v4f32 MQPR:$src2), + (v4f32 MQPR:$src3)))), + (v4f32 (MVE_VFMSf32 $src1, $src2, $src3))>; +} + +let Predicates = [HasMVEFloat] in { + def : Pat<(v8f16 (fma (v8f16 MQPR:$src1), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))), + (v8f16 (MVE_VFMAf16 $src3, $src1, $src2))>; + def : Pat<(v4f32 (fma (v4f32 MQPR:$src1), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))), + (v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>; +} + + +let validForTailPredication = 1 in { + def MVE_VADDf32 : MVE_VADDSUBFMA_fp<"vadd", "f32", 0b0, 0b0, 0b1, 0b0>; + def MVE_VADDf16 : MVE_VADDSUBFMA_fp<"vadd", "f16", 0b1, 0b0, 0b1, 0b0>; +} let Predicates = [HasMVEFloat] in { def : Pat<(v4f32 (fadd (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))), @@ -2566,8 +2776,11 @@ let Predicates = [HasMVEFloat] in { (v8f16 (MVE_VADDf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; } -def MVE_VSUBf32 : MVE_VADDSUBFMA_fp<"vsub", "f32", 0b0, 0b0, 0b1, 0b1>; -def MVE_VSUBf16 : MVE_VADDSUBFMA_fp<"vsub", "f16", 0b1, 0b0, 0b1, 0b1>; + +let validForTailPredication = 1 in { + def MVE_VSUBf32 : MVE_VADDSUBFMA_fp<"vsub", "f32", 0b0, 0b0, 0b1, 0b1>; + def MVE_VSUBf16 : MVE_VADDSUBFMA_fp<"vsub", "f16", 0b1, 0b0, 0b1, 0b1>; +} let Predicates = [HasMVEFloat] in { def : Pat<(v4f32 (fsub (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))), @@ -2576,10 +2789,10 @@ let Predicates = [HasMVEFloat] in { (v8f16 (MVE_VSUBf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; } -class MVE_VCADD<string suffix, bit size, list<dag> pattern=[]> +class MVE_VCADD<string suffix, bit size, string cstr="", list<dag> pattern=[]> : MVEFloatArithNeon<"vcadd", suffix, size, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot), - "$Qd, $Qn, $Qm, $rot", vpred_r, "", pattern> { + "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> { bits<4> Qd; bits<4> Qn; bit rot; @@ -2598,7 +2811,7 @@ class MVE_VCADD<string suffix, bit size, list<dag> pattern=[]> } def MVE_VCADDf16 : MVE_VCADD<"f16", 0b0>; -def MVE_VCADDf32 : MVE_VCADD<"f32", 0b1>; +def MVE_VCADDf32 : MVE_VCADD<"f32", 0b1, 
"@earlyclobber $Qd">; class MVE_VABD_fp<string suffix, bit size> : MVE_float<"vabd", suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), @@ -2617,6 +2830,7 @@ class MVE_VABD_fp<string suffix, bit size> let Inst{11-8} = 0b1101; let Inst{7} = Qn{3}; let Inst{4} = 0b0; + let validForTailPredication = 1; } def MVE_VABDf32 : MVE_VABD_fp<"f32", 0b0>; @@ -2643,6 +2857,7 @@ class MVE_VCVT_fix<string suffix, bit fsi, bit U, bit op, let Inst{4} = 0b1; let DecoderMethod = "DecodeMVEVCVTt1fp"; + let validForTailPredication = 1; } class MVE_VCVT_imm_asmop<int Bits> : AsmOperandClass { @@ -2693,6 +2908,7 @@ class MVE_VCVT_fp_int_anpm<string suffix, bits<2> size, bit op, string anpm, let Inst{9-8} = rm; let Inst{7} = op; let Inst{4} = 0b0; + let validForTailPredication = 1; } multiclass MVE_VCVT_fp_int_anpm_multi<string suffix, bits<2> size, bit op, @@ -2727,6 +2943,7 @@ class MVE_VCVT_fp_int<string suffix, bits<2> size, bits<2> op, let Inst{12-9} = 0b0011; let Inst{8-7} = op; let Inst{4} = 0b0; + let validForTailPredication = 1; } // The unsuffixed VCVT for float->int implicitly rounds toward zero, @@ -2776,6 +2993,7 @@ class MVE_VABSNEG_fp<string iname, string suffix, bits<2> size, bit negate, let Inst{11-8} = 0b0111; let Inst{7} = negate; let Inst{4} = 0b0; + let validForTailPredication = 1; } def MVE_VABSf16 : MVE_VABSNEG_fp<"vabs", "f16", 0b01, 0b0>; @@ -2863,6 +3081,7 @@ class MVE_VCMPqq<string suffix, bit bit_28, bits<2> bits_21_20, // decoder to emit an operand that isn't affected by any instruction // bit. let DecoderMethod = "DecodeMVEVCMP<false," # predtype.DecoderMethod # ">"; + let validForTailPredication = 1; } class MVE_VCMPqqf<string suffix, bit size> @@ -2927,6 +3146,7 @@ class MVE_VCMPqr<string suffix, bit bit_28, bits<2> bits_21_20, let Constraints = ""; // Custom decoder method, for the same reason as MVE_VCMPqq let DecoderMethod = "DecodeMVEVCMP<true," # predtype.DecoderMethod # ">"; + let validForTailPredication = 1; } class MVE_VCMPqrf<string suffix, bit size> @@ -2966,6 +3186,168 @@ def MVE_VCMPs8r : MVE_VCMPqrs<"s8", 0b00>; def MVE_VCMPs16r : MVE_VCMPqrs<"s16", 0b01>; def MVE_VCMPs32r : MVE_VCMPqrs<"s32", 0b10>; +multiclass unpred_vcmp_z<string suffix, int fc> { + def i8 : Pat<(v16i1 (ARMvcmpz (v16i8 MQPR:$v1), (i32 fc))), + (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc))>; + def i16 : Pat<(v8i1 (ARMvcmpz (v8i16 MQPR:$v1), (i32 fc))), + (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc))>; + def i32 : Pat<(v4i1 (ARMvcmpz (v4i32 MQPR:$v1), (i32 fc))), + (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc))>; + + def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmpz (v16i8 MQPR:$v1), (i32 fc))))), + (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8i16 MQPR:$v1), (i32 fc))))), + (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4i32 MQPR:$v1), (i32 fc))))), + (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; +} + +multiclass unpred_vcmp_r<string suffix, int fc> { + def i8 : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), (i32 fc))), + (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc))>; + def i16 : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), (i32 fc))), + (v8i1 
(!cast<Instruction>("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc))>; + def i32 : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), (i32 fc))), + (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc))>; + + def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), (i32 fc))), + (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc))>; + def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), (i32 fc))), + (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc))>; + def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), (i32 fc))), + (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc))>; + + def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), (i32 fc))))), + (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, 1, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), (i32 fc))))), + (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc, 1, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), (i32 fc))))), + (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, 1, VCCR:$p1))>; + + def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), (i32 fc))))), + (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), (i32 fc))))), + (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), (i32 fc))))), + (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>; +} + +multiclass unpred_vcmpf_z<int fc> { + def f16 : Pat<(v8i1 (ARMvcmpz (v8f16 MQPR:$v1), (i32 fc))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc))>; + def f32 : Pat<(v4i1 (ARMvcmpz (v4f32 MQPR:$v1), (i32 fc))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc))>; + + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8f16 MQPR:$v1), (i32 fc))))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4f32 MQPR:$v1), (i32 fc))))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; +} + +multiclass unpred_vcmpf_r<int fc> { + def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), (i32 fc))), + (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>; + def f32 : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), (i32 fc))), + (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>; + + def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), (i32 fc))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc))>; + def f32r : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), (i32 fc))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc))>; + + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), (i32 fc))))), + (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc,
1, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), (i32 fc))))), + (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, 1, VCCR:$p1))>; + + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), (i32 fc))))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, 1, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), (i32 fc))))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, 1, VCCR:$p1))>; +} + +let Predicates = [HasMVEInt] in { + defm MVE_VCEQZ : unpred_vcmp_z<"i", 0>; + defm MVE_VCNEZ : unpred_vcmp_z<"i", 1>; + defm MVE_VCGEZ : unpred_vcmp_z<"s", 10>; + defm MVE_VCLTZ : unpred_vcmp_z<"s", 11>; + defm MVE_VCGTZ : unpred_vcmp_z<"s", 12>; + defm MVE_VCLEZ : unpred_vcmp_z<"s", 13>; + defm MVE_VCGTUZ : unpred_vcmp_z<"u", 8>; + defm MVE_VCGEUZ : unpred_vcmp_z<"u", 2>; + + defm MVE_VCEQ : unpred_vcmp_r<"i", 0>; + defm MVE_VCNE : unpred_vcmp_r<"i", 1>; + defm MVE_VCGE : unpred_vcmp_r<"s", 10>; + defm MVE_VCLT : unpred_vcmp_r<"s", 11>; + defm MVE_VCGT : unpred_vcmp_r<"s", 12>; + defm MVE_VCLE : unpred_vcmp_r<"s", 13>; + defm MVE_VCGTU : unpred_vcmp_r<"u", 8>; + defm MVE_VCGEU : unpred_vcmp_r<"u", 2>; +} + +let Predicates = [HasMVEFloat] in { + defm MVE_VFCEQZ : unpred_vcmpf_z<0>; + defm MVE_VFCNEZ : unpred_vcmpf_z<1>; + defm MVE_VFCGEZ : unpred_vcmpf_z<10>; + defm MVE_VFCLTZ : unpred_vcmpf_z<11>; + defm MVE_VFCGTZ : unpred_vcmpf_z<12>; + defm MVE_VFCLEZ : unpred_vcmpf_z<13>; + + defm MVE_VFCEQ : unpred_vcmpf_r<0>; + defm MVE_VFCNE : unpred_vcmpf_r<1>; + defm MVE_VFCGE : unpred_vcmpf_r<10>; + defm MVE_VFCLT : unpred_vcmpf_r<11>; + defm MVE_VFCGT : unpred_vcmpf_r<12>; + defm MVE_VFCLE : unpred_vcmpf_r<13>; +} + + +// Extra "worst case" and/or/xor patterns, going into and out of GPR +multiclass two_predops<SDPatternOperator opnode, Instruction insn> { + def v16i1 : Pat<(v16i1 (opnode (v16i1 VCCR:$p1), (v16i1 VCCR:$p2))), + (v16i1 (COPY_TO_REGCLASS + (insn (i32 (COPY_TO_REGCLASS (v16i1 VCCR:$p1), rGPR)), + (i32 (COPY_TO_REGCLASS (v16i1 VCCR:$p2), rGPR))), + VCCR))>; + def v8i1 : Pat<(v8i1 (opnode (v8i1 VCCR:$p1), (v8i1 VCCR:$p2))), + (v8i1 (COPY_TO_REGCLASS + (insn (i32 (COPY_TO_REGCLASS (v8i1 VCCR:$p1), rGPR)), + (i32 (COPY_TO_REGCLASS (v8i1 VCCR:$p2), rGPR))), + VCCR))>; + def v4i1 : Pat<(v4i1 (opnode (v4i1 VCCR:$p1), (v4i1 VCCR:$p2))), + (v4i1 (COPY_TO_REGCLASS + (insn (i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p1), rGPR)), + (i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p2), rGPR))), + VCCR))>; +} + +let Predicates = [HasMVEInt] in { + defm POR : two_predops<or, t2ORRrr>; + defm PAND : two_predops<and, t2ANDrr>; + defm PEOR : two_predops<xor, t2EORrr>; +} + +// Occasionally we need to cast between an i32 and a boolean vector, for +// example when moving between rGPR and VPR.P0 as part of predicate vector +// shuffles. We also sometimes need to cast between different predicate +// vector types (v4i1<>v8i1, etc.) as part of lowering vector shuffles.
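+// (Editorial sketch, not part of this patch; the layout described here is an
+// assumption about VPR.P0 rather than something the patch states.) P0 is a
+// 16-bit mask with one bit per byte lane, so a v16i1 lane owns 1 bit, a v8i1
+// lane 2 bits and a v4i1 lane 4 bits; all three views share the same register
+// contents, which is why each cast below lowers to a bare COPY_TO_REGCLASS.
+// In rough C++ terms:
+//   uint16_t p0 = ...;                                    // VPR.P0 contents
+//   bool v16[16], v4[4];
+//   for (int i = 0; i < 16; ++i) v16[i] = (p0 >> i) & 1;        // v16i1 view
+//   for (int i = 0; i < 4; ++i)  v4[i] = (p0 >> (4 * i)) & 1;   // v4i1 view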
+ +def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>; + +let Predicates = [HasMVEInt] in { + foreach VT = [ v4i1, v8i1, v16i1 ] in { + def : Pat<(i32 (predicate_cast (VT VCCR:$src))), + (i32 (COPY_TO_REGCLASS (VT VCCR:$src), VCCR))>; + def : Pat<(VT (predicate_cast (i32 VCCR:$src))), + (VT (COPY_TO_REGCLASS (i32 VCCR:$src), VCCR))>; + + foreach VT2 = [ v4i1, v8i1, v16i1 ] in + def : Pat<(VT (predicate_cast (VT2 VCCR:$src))), + (VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>; + } +} + // end of MVE compares // start of MVE_qDest_qSrc @@ -2989,10 +3371,10 @@ class MVE_qDest_qSrc<string iname, string suffix, dag oops, dag iops, } class MVE_VQxDMLxDH<string iname, bit exch, bit round, bit subtract, - string suffix, bits<2> size, list<dag> pattern=[]> + string suffix, bits<2> size, string cstr="", list<dag> pattern=[]> : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm", - vpred_n, "$Qd = $Qd_src", pattern> { + vpred_n, "$Qd = $Qd_src"#cstr, pattern> { bits<4> Qn; let Inst{28} = subtract; @@ -3009,7 +3391,7 @@ multiclass MVE_VQxDMLxDH_multi<string iname, bit exch, bit round, bit subtract> { def s8 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s8", 0b00>; def s16 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s16", 0b01>; - def s32 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s32", 0b10>; + def s32 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s32", 0b10, ",@earlyclobber $Qd">; } defm MVE_VQDMLADH : MVE_VQxDMLxDH_multi<"vqdmladh", 0b0, 0b0, 0b0>; @@ -3021,10 +3403,10 @@ defm MVE_VQDMLSDHX : MVE_VQxDMLxDH_multi<"vqdmlsdhx", 0b1, 0b0, 0b1>; defm MVE_VQRDMLSDH : MVE_VQxDMLxDH_multi<"vqrdmlsdh", 0b0, 0b1, 0b1>; defm MVE_VQRDMLSDHX : MVE_VQxDMLxDH_multi<"vqrdmlsdhx", 0b1, 0b1, 0b1>; -class MVE_VCMUL<string iname, string suffix, bit size, list<dag> pattern=[]> +class MVE_VCMUL<string iname, string suffix, bit size, string cstr="", list<dag> pattern=[]> : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot), - "$Qd, $Qn, $Qm, $rot", vpred_r, "", pattern> { + "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> { bits<4> Qn; bits<2> rot; @@ -3041,13 +3423,13 @@ class MVE_VCMUL<string iname, string suffix, bit size, list<dag> pattern=[]> } def MVE_VCMULf16 : MVE_VCMUL<"vcmul", "f16", 0b0>; -def MVE_VCMULf32 : MVE_VCMUL<"vcmul", "f32", 0b1>; +def MVE_VCMULf32 : MVE_VCMUL<"vcmul", "f32", 0b1, "@earlyclobber $Qd">; class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20, - bit T, list<dag> pattern=[]> + bit T, string cstr, list<dag> pattern=[]> : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm", - vpred_r, "", pattern> { + vpred_r, cstr, pattern> { bits<4> Qd; bits<4> Qn; bits<4> Qm; @@ -3063,9 +3445,9 @@ class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20, } multiclass MVE_VMULL_multi<string iname, string suffix, - bit bit_28, bits<2> bits_21_20> { - def bh : MVE_VMULL<iname # "b", suffix, bit_28, bits_21_20, 0b0>; - def th : MVE_VMULL<iname # "t", suffix, bit_28, bits_21_20, 0b1>; + bit bit_28, bits<2> bits_21_20, string cstr=""> { + def bh : MVE_VMULL<iname # "b", suffix, bit_28, bits_21_20, 0b0, cstr>; + def th : MVE_VMULL<iname # "t", suffix, bit_28, bits_21_20, 0b1, cstr>; } // For integer multiplies, bits 21:20 encode size, and bit 28 signedness. 
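// (Editorial sketch, inferred from the MVE_VMULLs*/u*/p* definitions below
// rather than stated by this patch.) A rough C++ decode of those fields:
//   unsigned Size       = (Insn >> 20) & 0x3;  // 0b00=8, 0b01=16, 0b10=32 bit
//   bool     IsUnsigned = (Insn >> 28) & 0x1;  // integer forms only
// The otherwise-unused Size value 0b11 is repurposed for the polynomial
// vmull.p8/p16 forms, where bit 28 instead selects p8 vs p16.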
@@ -3074,10 +3456,10 @@ multiclass MVE_VMULL_multi<string iname, string suffix, defm MVE_VMULLs8 : MVE_VMULL_multi<"vmull", "s8", 0b0, 0b00>; defm MVE_VMULLs16 : MVE_VMULL_multi<"vmull", "s16", 0b0, 0b01>; -defm MVE_VMULLs32 : MVE_VMULL_multi<"vmull", "s32", 0b0, 0b10>; +defm MVE_VMULLs32 : MVE_VMULL_multi<"vmull", "s32", 0b0, 0b10, "@earlyclobber $Qd">; defm MVE_VMULLu8 : MVE_VMULL_multi<"vmull", "u8", 0b1, 0b00>; defm MVE_VMULLu16 : MVE_VMULL_multi<"vmull", "u16", 0b1, 0b01>; -defm MVE_VMULLu32 : MVE_VMULL_multi<"vmull", "u32", 0b1, 0b10>; +defm MVE_VMULLu32 : MVE_VMULL_multi<"vmull", "u32", 0b1, 0b10, "@earlyclobber $Qd">; defm MVE_VMULLp8 : MVE_VMULL_multi<"vmull", "p8", 0b0, 0b11>; defm MVE_VMULLp16 : MVE_VMULL_multi<"vmull", "p16", 0b1, 0b11>; @@ -3144,6 +3526,18 @@ defm MVE_VQMOVNu32 : MVE_VxMOVxN_halves<"vqmovn", "u32", 0b1, 0b1, 0b01>; defm MVE_VQMOVUNs16 : MVE_VxMOVxN_halves<"vqmovun", "s16", 0b0, 0b0, 0b00>; defm MVE_VQMOVUNs32 : MVE_VxMOVxN_halves<"vqmovun", "s32", 0b0, 0b0, 0b01>; +def MVEvmovn : SDNode<"ARMISD::VMOVN", SDTARMVEXT>; +let Predicates = [HasMVEInt] in { + def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))), + (v8i16 (MVE_VMOVNi32bh (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>; + def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))), + (v8i16 (MVE_VMOVNi32th (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>; + def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 0))), + (v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>; + def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 1))), + (v16i8 (MVE_VMOVNi16th (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>; +} + class MVE_VCVT_ff<string iname, string suffix, bit op, bit T, list<dag> pattern=[]> : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm), @@ -3166,11 +3560,10 @@ defm MVE_VCVTf16f32 : MVE_VCVT_ff_halves<"f16.f32", 0b0>; defm MVE_VCVTf32f16 : MVE_VCVT_ff_halves<"f32.f16", 0b1>; class MVE_VxCADD<string iname, string suffix, bits<2> size, bit halve, - list<dag> pattern=[]> + string cstr="", list<dag> pattern=[]> : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot), - "$Qd, $Qn, $Qm, $rot", vpred_r, "", - pattern> { + "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> { bits<4> Qn; bit rot; @@ -3186,11 +3579,11 @@ class MVE_VxCADD<string iname, string suffix, bits<2> size, bit halve, def MVE_VCADDi8 : MVE_VxCADD<"vcadd", "i8", 0b00, 0b1>; def MVE_VCADDi16 : MVE_VxCADD<"vcadd", "i16", 0b01, 0b1>; -def MVE_VCADDi32 : MVE_VxCADD<"vcadd", "i32", 0b10, 0b1>; +def MVE_VCADDi32 : MVE_VxCADD<"vcadd", "i32", 0b10, 0b1, "@earlyclobber $Qd">; def MVE_VHCADDs8 : MVE_VxCADD<"vhcadd", "s8", 0b00, 0b0>; def MVE_VHCADDs16 : MVE_VxCADD<"vhcadd", "s16", 0b01, 0b0>; -def MVE_VHCADDs32 : MVE_VxCADD<"vhcadd", "s32", 0b10, 0b0>; +def MVE_VHCADDs32 : MVE_VxCADD<"vhcadd", "s32", 0b10, 0b0, "@earlyclobber $Qd">; class MVE_VADCSBC<string iname, bit I, bit subtract, dag carryin, list<dag> pattern=[]> @@ -3220,10 +3613,10 @@ def MVE_VSBC : MVE_VADCSBC<"vsbc", 0b0, 0b1, (ins cl_FPSCR_NZCV:$carryin)>; def MVE_VSBCI : MVE_VADCSBC<"vsbci", 0b1, 0b1, (ins)>; class MVE_VQDMULL<string iname, string suffix, bit size, bit T, - list<dag> pattern=[]> + string cstr="", list<dag> pattern=[]> : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm", - vpred_r, "", pattern> { + vpred_r, cstr, pattern> { bits<4> Qn; let Inst{28} = size; @@ -3236,13 +3629,13 @@ class 
MVE_VQDMULL<string iname, string suffix, bit size, bit T, let Inst{0} = 0b1; } -multiclass MVE_VQDMULL_halves<string suffix, bit size> { - def bh : MVE_VQDMULL<"vqdmullb", suffix, size, 0b0>; - def th : MVE_VQDMULL<"vqdmullt", suffix, size, 0b1>; +multiclass MVE_VQDMULL_halves<string suffix, bit size, string cstr=""> { + def bh : MVE_VQDMULL<"vqdmullb", suffix, size, 0b0, cstr>; + def th : MVE_VQDMULL<"vqdmullt", suffix, size, 0b1, cstr>; } defm MVE_VQDMULLs16 : MVE_VQDMULL_halves<"s16", 0b0>; -defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<"s32", 0b1>; +defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<"s32", 0b1, "@earlyclobber $Qd">; // end of mve_qDest_qSrc @@ -3267,9 +3660,9 @@ class MVE_qr_base<dag oops, dag iops, InstrItinClass itin, string iname, let Inst{3-0} = Rm{3-0}; } -class MVE_qDest_rSrc<string iname, string suffix, list<dag> pattern=[]> +class MVE_qDest_rSrc<string iname, string suffix, string cstr="", list<dag> pattern=[]> : MVE_qr_base<(outs MQPR:$Qd), (ins MQPR:$Qn, rGPR:$Rm), - NoItinerary, iname, suffix, "$Qd, $Qn, $Rm", vpred_r, "", + NoItinerary, iname, suffix, "$Qd, $Qn, $Rm", vpred_r, cstr, pattern>; class MVE_qDestSrc_rSrc<string iname, string suffix, list<dag> pattern=[]> @@ -3291,7 +3684,7 @@ class MVE_qDest_single_rSrc<string iname, string suffix, list<dag> pattern=[]> class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size, bit bit_5, bit bit_12, bit bit_16, bit bit_28, list<dag> pattern=[]> - : MVE_qDest_rSrc<iname, suffix, pattern> { + : MVE_qDest_rSrc<iname, suffix, "", pattern> { let Inst{28} = bit_28; let Inst{21-20} = size; @@ -3299,6 +3692,7 @@ class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size, let Inst{12} = bit_12; let Inst{8} = 0b1; let Inst{5} = bit_5; + let validForTailPredication = 1; } multiclass MVE_VADDSUB_qr_sizes<string iname, string suffix, @@ -3320,9 +3714,27 @@ defm MVE_VSUB_qr_i : MVE_VADDSUB_qr_sizes<"vsub", "i", 0b0, 0b1, 0b1, 0b0>; defm MVE_VQSUB_qr_s : MVE_VADDSUB_qr_sizes<"vqsub", "s", 0b1, 0b1, 0b0, 0b0>; defm MVE_VQSUB_qr_u : MVE_VADDSUB_qr_sizes<"vqsub", "u", 0b1, 0b1, 0b0, 0b1>; +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), + (v16i8 (MVE_VADD_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; + def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), + (v8i16 (MVE_VADD_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; + def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), + (v4i32 (MVE_VADD_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; +} + +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), + (v16i8 (MVE_VSUB_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; + def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), + (v8i16 (MVE_VSUB_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; + def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), + (v4i32 (MVE_VSUB_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; +} + class MVE_VQDMULL_qr<string iname, string suffix, bit size, - bit T, list<dag> pattern=[]> - : MVE_qDest_rSrc<iname, suffix, pattern> { + bit T, string cstr="", list<dag> pattern=[]> + : MVE_qDest_rSrc<iname, suffix, cstr, pattern> { let Inst{28} = size; let Inst{21-20} = 0b11; @@ -3332,18 +3744,18 @@ class MVE_VQDMULL_qr<string iname, string suffix, bit size, let Inst{5} = 0b1; } -multiclass MVE_VQDMULL_qr_halves<string suffix, bit size> { - def bh : MVE_VQDMULL_qr<"vqdmullb", suffix, size, 0b0>; - def th : MVE_VQDMULL_qr<"vqdmullt", suffix, 
size, 0b1>; +multiclass MVE_VQDMULL_qr_halves<string suffix, bit size, string cstr=""> { + def bh : MVE_VQDMULL_qr<"vqdmullb", suffix, size, 0b0, cstr>; + def th : MVE_VQDMULL_qr<"vqdmullt", suffix, size, 0b1, cstr>; } defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<"s16", 0b0>; -defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<"s32", 0b1>; +defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<"s32", 0b1, "@earlyclobber $Qd">; class MVE_VxADDSUB_qr<string iname, string suffix, bit bit_28, bits<2> bits_21_20, bit subtract, list<dag> pattern=[]> - : MVE_qDest_rSrc<iname, suffix, pattern> { + : MVE_qDest_rSrc<iname, suffix, "", pattern> { let Inst{28} = bit_28; let Inst{21-20} = bits_21_20; @@ -3351,6 +3763,7 @@ class MVE_VxADDSUB_qr<string iname, string suffix, let Inst{12} = subtract; let Inst{8} = 0b1; let Inst{5} = 0b0; + let validForTailPredication = 1; } def MVE_VHADD_qr_s8 : MVE_VxADDSUB_qr<"vhadd", "s8", 0b0, 0b00, 0b0>; @@ -3388,6 +3801,7 @@ class MVE_VxSHL_qr<string iname, string suffix, bit U, bits<2> size, let Inst{12-8} = 0b11110; let Inst{7} = bit_7; let Inst{6-4} = 0b110; + let validForTailPredication = 1; } multiclass MVE_VxSHL_qr_types<string iname, bit bit_7, bit bit_17> { @@ -3421,7 +3835,7 @@ let Predicates = [HasMVEInt] in { } class MVE_VBRSR<string iname, string suffix, bits<2> size, list<dag> pattern=[]> - : MVE_qDest_rSrc<iname, suffix, pattern> { + : MVE_qDest_rSrc<iname, suffix, "", pattern> { let Inst{28} = 0b1; let Inst{21-20} = size; @@ -3429,15 +3843,27 @@ class MVE_VBRSR<string iname, string suffix, bits<2> size, list<dag> pattern=[]> let Inst{12} = 0b1; let Inst{8} = 0b0; let Inst{5} = 0b1; + let validForTailPredication = 1; } def MVE_VBRSR8 : MVE_VBRSR<"vbrsr", "8", 0b00>; def MVE_VBRSR16 : MVE_VBRSR<"vbrsr", "16", 0b01>; def MVE_VBRSR32 : MVE_VBRSR<"vbrsr", "32", 0b10>; +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 ( bitreverse (v16i8 MQPR:$val1))), + (v16i8 ( MVE_VBRSR8 (v16i8 MQPR:$val1), (t2MOVi (i32 8)) ))>; + + def : Pat<(v4i32 ( bitreverse (v4i32 MQPR:$val1))), + (v4i32 ( MVE_VBRSR32 (v4i32 MQPR:$val1), (t2MOVi (i32 32)) ))>; + + def : Pat<(v8i16 ( bitreverse (v8i16 MQPR:$val1))), + (v8i16 ( MVE_VBRSR16 (v8i16 MQPR:$val1), (t2MOVi (i32 16)) ))>; +} + class MVE_VMUL_qr_int<string iname, string suffix, bits<2> size, list<dag> pattern=[]> - : MVE_qDest_rSrc<iname, suffix, pattern> { + : MVE_qDest_rSrc<iname, suffix, "", pattern> { let Inst{28} = 0b0; let Inst{21-20} = size; @@ -3445,15 +3871,25 @@ class MVE_VMUL_qr_int<string iname, string suffix, let Inst{12} = 0b1; let Inst{8} = 0b0; let Inst{5} = 0b1; + let validForTailPredication = 1; } def MVE_VMUL_qr_i8 : MVE_VMUL_qr_int<"vmul", "i8", 0b00>; def MVE_VMUL_qr_i16 : MVE_VMUL_qr_int<"vmul", "i16", 0b01>; def MVE_VMUL_qr_i32 : MVE_VMUL_qr_int<"vmul", "i32", 0b10>; +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), + (v16i8 (MVE_VMUL_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; + def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), + (v8i16 (MVE_VMUL_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; + def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), + (v4i32 (MVE_VMUL_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; +} + class MVE_VxxMUL_qr<string iname, string suffix, bit bit_28, bits<2> bits_21_20, list<dag> pattern=[]> - : MVE_qDest_rSrc<iname, suffix, pattern> { + : MVE_qDest_rSrc<iname, suffix, "", pattern> { let Inst{28} = bit_28; let Inst{21-20} = bits_21_20; @@ -3471,14 +3907,14 @@ def 
MVE_VQRDMULH_qr_s8 : MVE_VxxMUL_qr<"vqrdmulh", "s8", 0b1, 0b00>; def MVE_VQRDMULH_qr_s16 : MVE_VxxMUL_qr<"vqrdmulh", "s16", 0b1, 0b01>; def MVE_VQRDMULH_qr_s32 : MVE_VxxMUL_qr<"vqrdmulh", "s32", 0b1, 0b10>; -let Predicates = [HasMVEFloat] in { +let Predicates = [HasMVEFloat], validForTailPredication = 1 in { def MVE_VMUL_qr_f16 : MVE_VxxMUL_qr<"vmul", "f16", 0b1, 0b11>; def MVE_VMUL_qr_f32 : MVE_VxxMUL_qr<"vmul", "f32", 0b0, 0b11>; } class MVE_VFMAMLA_qr<string iname, string suffix, - bit bit_28, bits<2> bits_21_20, bit S, - list<dag> pattern=[]> + bit bit_28, bits<2> bits_21_20, bit S, + list<dag> pattern=[]> : MVE_qDestSrc_rSrc<iname, suffix, pattern> { let Inst{28} = bit_28; @@ -3487,6 +3923,7 @@ class MVE_VFMAMLA_qr<string iname, string suffix, let Inst{12} = S; let Inst{8} = 0b0; let Inst{5} = 0b0; + let validForTailPredication = 1; } def MVE_VMLA_qr_s8 : MVE_VFMAMLA_qr<"vmla", "s8", 0b0, 0b00, 0b0>; @@ -3503,6 +3940,21 @@ def MVE_VMLAS_qr_u8 : MVE_VFMAMLA_qr<"vmlas", "u8", 0b1, 0b00, 0b1>; def MVE_VMLAS_qr_u16 : MVE_VFMAMLA_qr<"vmlas", "u16", 0b1, 0b01, 0b1>; def MVE_VMLAS_qr_u32 : MVE_VFMAMLA_qr<"vmlas", "u32", 0b1, 0b10, 0b1>; +let Predicates = [HasMVEInt] in { + def : Pat<(v4i32 (add (v4i32 MQPR:$src1), + (v4i32 (mul (v4i32 MQPR:$src2), + (v4i32 (ARMvdup (i32 rGPR:$x))))))), + (v4i32 (MVE_VMLA_qr_u32 $src1, $src2, $x))>; + def : Pat<(v8i16 (add (v8i16 MQPR:$src1), + (v8i16 (mul (v8i16 MQPR:$src2), + (v8i16 (ARMvdup (i32 rGPR:$x))))))), + (v8i16 (MVE_VMLA_qr_u16 $src1, $src2, $x))>; + def : Pat<(v16i8 (add (v16i8 MQPR:$src1), + (v16i8 (mul (v16i8 MQPR:$src2), + (v16i8 (ARMvdup (i32 rGPR:$x))))))), + (v16i8 (MVE_VMLA_qr_u8 $src1, $src2, $x))>; +} + let Predicates = [HasMVEFloat] in { def MVE_VFMA_qr_f16 : MVE_VFMAMLA_qr<"vfma", "f16", 0b1, 0b11, 0b0>; def MVE_VFMA_qr_f32 : MVE_VFMAMLA_qr<"vfma", "f32", 0b0, 0b11, 0b0>; @@ -3555,6 +4007,7 @@ class MVE_VxDUP<string iname, string suffix, bits<2> size, bit bit_12, let Inst{7} = imm{1}; let Inst{6-1} = 0b110111; let Inst{0} = imm{0}; + let validForTailPredication = 1; } def MVE_VIDUPu8 : MVE_VxDUP<"vidup", "u8", 0b00, 0b0>; @@ -3589,6 +4042,7 @@ class MVE_VxWDUP<string iname, string suffix, bits<2> size, bit bit_12, let Inst{6-4} = 0b110; let Inst{3-1} = Rm{3-1}; let Inst{0} = imm{0}; + let validForTailPredication = 1; } def MVE_VIWDUPu8 : MVE_VxWDUP<"viwdup", "u8", 0b00, 0b0>; @@ -3599,6 +4053,7 @@ def MVE_VDWDUPu8 : MVE_VxWDUP<"vdwdup", "u8", 0b00, 0b1>; def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>; def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>; +let hasSideEffects = 1 in class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]> : MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix, "$Rn", vpred_n, "", pattern> { @@ -3614,6 +4069,7 @@ class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]> let Constraints = ""; let DecoderMethod = "DecodeMveVCTP"; + let validForTailPredication = 1; } def MVE_VCTP8 : MVE_VCTP<"8", 0b00>; @@ -3621,6 +4077,15 @@ def MVE_VCTP16 : MVE_VCTP<"16", 0b01>; def MVE_VCTP32 : MVE_VCTP<"32", 0b10>; def MVE_VCTP64 : MVE_VCTP<"64", 0b11>; +let Predicates = [HasMVEInt] in { + def : Pat<(int_arm_vctp8 rGPR:$Rn), + (v16i1 (MVE_VCTP8 rGPR:$Rn))>; + def : Pat<(int_arm_vctp16 rGPR:$Rn), + (v8i1 (MVE_VCTP16 rGPR:$Rn))>; + def : Pat<(int_arm_vctp32 rGPR:$Rn), + (v4i1 (MVE_VCTP32 rGPR:$Rn))>; +} + // end of mve_qDest_rSrc // start of coproc mov @@ -3863,6 +4328,7 @@ class MVE_VLDRSTR_base<MVE_ldst_direction dir, bit U, bit P, bit W, bit opc, let mayLoad = dir.load; 
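  // (Editorial note, not part of this patch:) 'dir' is this instruction's
  // MVE_ldst_direction record, so mayLoad here and mayStore below are derived,
  // mutually exclusive flags.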
let mayStore = !eq(dir.load,0); + let validForTailPredication = 1; } // Contiguous load and store instructions. These come in two main @@ -4165,7 +4631,8 @@ class MVE_VPT<string suffix, bits<2> size, dag iops, string asm, list<dag> patte let Inst{7} = fc{0}; let Inst{4} = 0b0; - let Defs = [VPR, P0]; + let Defs = [VPR]; + let validForTailPredication = 1; } class MVE_VPTt1<string suffix, bits<2> size, dag iops> @@ -4177,11 +4644,12 @@ class MVE_VPTt1<string suffix, bits<2> size, dag iops> let Inst{5} = Qm{3}; let Inst{3-1} = Qm{2-0}; let Inst{0} = fc{1}; + let validForTailPredication = 1; } class MVE_VPTt1i<string suffix, bits<2> size> : MVE_VPTt1<suffix, size, - (ins vpt_mask:$Mk, pred_basic_i:$fc, MQPR:$Qn, MQPR:$Qm)> { + (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_i:$fc)> { let Inst{12} = 0b0; let Inst{0} = 0b0; } @@ -4192,7 +4660,7 @@ def MVE_VPTv16i8 : MVE_VPTt1i<"i8", 0b00>; class MVE_VPTt1u<string suffix, bits<2> size> : MVE_VPTt1<suffix, size, - (ins vpt_mask:$Mk, pred_basic_u:$fc, MQPR:$Qn, MQPR:$Qm)> { + (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_u:$fc)> { let Inst{12} = 0b0; let Inst{0} = 0b1; } @@ -4203,7 +4671,7 @@ def MVE_VPTv16u8 : MVE_VPTt1u<"u8", 0b00>; class MVE_VPTt1s<string suffix, bits<2> size> : MVE_VPTt1<suffix, size, - (ins vpt_mask:$Mk, pred_basic_s:$fc, MQPR:$Qn, MQPR:$Qm)> { + (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_s:$fc)> { let Inst{12} = 0b1; } @@ -4225,7 +4693,7 @@ class MVE_VPTt2<string suffix, bits<2> size, dag iops> class MVE_VPTt2i<string suffix, bits<2> size> : MVE_VPTt2<suffix, size, - (ins vpt_mask:$Mk, pred_basic_i:$fc, MQPR:$Qn, GPRwithZR:$Rm)> { + (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_i:$fc)> { let Inst{12} = 0b0; let Inst{5} = 0b0; } @@ -4236,7 +4704,7 @@ def MVE_VPTv16i8r : MVE_VPTt2i<"i8", 0b00>; class MVE_VPTt2u<string suffix, bits<2> size> : MVE_VPTt2<suffix, size, - (ins vpt_mask:$Mk, pred_basic_u:$fc, MQPR:$Qn, GPRwithZR:$Rm)> { + (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_u:$fc)> { let Inst{12} = 0b0; let Inst{5} = 0b1; } @@ -4247,7 +4715,7 @@ def MVE_VPTv16u8r : MVE_VPTt2u<"u8", 0b00>; class MVE_VPTt2s<string suffix, bits<2> size> : MVE_VPTt2<suffix, size, - (ins vpt_mask:$Mk, pred_basic_s:$fc, MQPR:$Qn, GPRwithZR:$Rm)> { + (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_s:$fc)> { let Inst{12} = 0b1; } @@ -4276,12 +4744,13 @@ class MVE_VPTf<string suffix, bit size, dag iops, string asm, list<dag> pattern= let Inst{7} = fc{0}; let Inst{4} = 0b0; - let Defs = [P0]; + let Defs = [VPR]; let Predicates = [HasMVEFloat]; + let validForTailPredication = 1; } class MVE_VPTft1<string suffix, bit size> - : MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, pred_basic_fp:$fc, MQPR:$Qn, MQPR:$Qm), + : MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_fp:$fc), "$fc, $Qn, $Qm"> { bits<3> fc; bits<4> Qm; @@ -4296,7 +4765,7 @@ def MVE_VPTv4f32 : MVE_VPTft1<"f32", 0b0>; def MVE_VPTv8f16 : MVE_VPTft1<"f16", 0b1>; class MVE_VPTft2<string suffix, bit size> - : MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, pred_basic_fp:$fc, MQPR:$Qn, GPRwithZR:$Rm), + : MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_fp:$fc), "$fc, $Qn, $Rm"> { bits<3> fc; bits<4> Rm; @@ -4322,7 +4791,8 @@ def MVE_VPST : MVE_MI<(outs ), (ins vpt_mask:$Mk), NoItinerary, let Unpredictable{7} = 0b1; let Unpredictable{5} = 0b1; - let Defs = [P0]; + let Uses = [VPR]; + let validForTailPredication = 1; } def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary, @@ -4346,6 +4816,7 @@ 
def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary, let Inst{4} = 0b0; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b1; + let validForTailPredication = 1; } foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32", @@ -4353,19 +4824,113 @@ foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32", def : MVEInstAlias<"vpsel${vp}." # suffix # "\t$Qd, $Qn, $Qm", (MVE_VPSEL MQPR:$Qd, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; -def MVE_VPNOT : MVE_p<(outs), (ins), NoItinerary, +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (vselect (v16i1 VCCR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), + (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), + (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), + (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + + def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))), + (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))), + (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + + def : Pat<(v16i8 (vselect (v16i8 MQPR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), + (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, + (MVE_VCMPi8 (v16i8 MQPR:$pred), (MVE_VMOVimmi8 0), 1)))>; + def : Pat<(v8i16 (vselect (v8i16 MQPR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), + (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, + (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>; + def : Pat<(v4i32 (vselect (v4i32 MQPR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), + (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, + (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>; + + def : Pat<(v8f16 (vselect (v8i16 MQPR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))), + (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, + (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>; + def : Pat<(v4f32 (vselect (v4i32 MQPR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))), + (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, + (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>; + + // Pred <-> Int + def : Pat<(v16i8 (zext (v16i1 VCCR:$pred))), + (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; + def : Pat<(v8i16 (zext (v8i1 VCCR:$pred))), + (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; + def : Pat<(v4i32 (zext (v4i1 VCCR:$pred))), + (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; + + def : Pat<(v16i8 (sext (v16i1 VCCR:$pred))), + (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; + def : Pat<(v8i16 (sext (v8i1 VCCR:$pred))), + (v8i16 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; + def : Pat<(v4i32 (sext (v4i1 VCCR:$pred))), + (v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; + + def : Pat<(v16i8 (anyext (v16i1 VCCR:$pred))), + (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; + def : Pat<(v8i16 (anyext (v8i1 VCCR:$pred))), + (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; + def : Pat<(v4i32 (anyext (v4i1 VCCR:$pred))), + (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; + + def : Pat<(v16i1 (trunc (v16i8 MQPR:$v1))), + (v16i1 (MVE_VCMPi32r (v16i8 MQPR:$v1), ZR, 1))>; + def : Pat<(v8i1 (trunc (v8i16 MQPR:$v1))), + (v8i1 (MVE_VCMPi32r (v8i16 MQPR:$v1), ZR, 1))>; + def : 
Pat<(v4i1 (trunc (v4i32 MQPR:$v1))), + (v4i1 (MVE_VCMPi32r (v4i32 MQPR:$v1), ZR, 1))>; +} + +let Predicates = [HasMVEFloat] in { + // Pred <-> Float + // 112 is 1.0 in float + def : Pat<(v4f32 (uint_to_fp (v4i1 VCCR:$pred))), + (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 112)), (v4f32 (MVE_VMOVimmi32 0)), 0, VCCR:$pred))>; + // 2620 is 1.0 in half + def : Pat<(v8f16 (uint_to_fp (v8i1 VCCR:$pred))), + (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2620)), (v8f16 (MVE_VMOVimmi16 0)), 0, VCCR:$pred))>; + // 240 is -1.0 in float + def : Pat<(v4f32 (sint_to_fp (v4i1 VCCR:$pred))), + (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 240)), (v4f32 (MVE_VMOVimmi32 0)), 0, VCCR:$pred))>; + // 2748 is -1.0 in half + def : Pat<(v8f16 (sint_to_fp (v8i1 VCCR:$pred))), + (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2748)), (v8f16 (MVE_VMOVimmi16 0)), 0, VCCR:$pred))>; + + def : Pat<(v4i1 (fp_to_uint (v4f32 MQPR:$v1))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, 1))>; + def : Pat<(v8i1 (fp_to_uint (v8f16 MQPR:$v1))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, 1))>; + def : Pat<(v4i1 (fp_to_sint (v4f32 MQPR:$v1))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, 1))>; + def : Pat<(v8i1 (fp_to_sint (v8f16 MQPR:$v1))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, 1))>; +} + +def MVE_VPNOT : MVE_p<(outs VCCR:$P0), (ins VCCR:$P0_in), NoItinerary, "vpnot", "", "", vpred_n, "", []> { let Inst{31-0} = 0b11111110001100010000111101001101; let Unpredictable{19-17} = 0b111; let Unpredictable{12} = 0b1; let Unpredictable{7} = 0b1; let Unpredictable{5} = 0b1; - let Defs = [P0]; - let Uses = [P0]; let Constraints = ""; + let DecoderMethod = "DecodeMVEVPNOT"; } +let Predicates = [HasMVEInt] in { + def : Pat<(v4i1 (xor (v4i1 VCCR:$pred), (v4i1 (predicate_cast (i32 65535))))), + (v4i1 (MVE_VPNOT (v4i1 VCCR:$pred)))>; + def : Pat<(v8i1 (xor (v8i1 VCCR:$pred), (v8i1 (predicate_cast (i32 65535))))), + (v8i1 (MVE_VPNOT (v8i1 VCCR:$pred)))>; + def : Pat<(v16i1 (xor (v16i1 VCCR:$pred), (v16i1 (predicate_cast (i32 65535))))), + (v16i1 (MVE_VPNOT (v16i1 VCCR:$pred)))>; +} + + class MVE_loltp_start<dag iops, string asm, string ops, bits<2> size> : t2LOL<(outs GPRlr:$LR), iops, asm, ops> { bits<4> Rn; @@ -4433,159 +4998,440 @@ def MVE_LCTP : MVE_loltp_end<(outs), (ins pred:$p), "lctp${p}", ""> { // Patterns //===----------------------------------------------------------------------===// -class MVE_unpred_vector_store_typed<ValueType Ty, Instruction RegImmInst, +class MVE_vector_store_typed<ValueType Ty, Instruction RegImmInst, + PatFrag StoreKind, int shift> + : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr), + (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr)>; +class MVE_vector_maskedstore_typed<ValueType Ty, Instruction RegImmInst, + PatFrag StoreKind, int shift> + : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, VCCR:$pred), + (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred)>; + +multiclass MVE_vector_store<Instruction RegImmInst, PatFrag StoreKind, + int shift> { + def : MVE_vector_store_typed<v16i8, RegImmInst, StoreKind, shift>; + def : MVE_vector_store_typed<v8i16, RegImmInst, StoreKind, shift>; + def : MVE_vector_store_typed<v8f16, RegImmInst, StoreKind, shift>; + def : MVE_vector_store_typed<v4i32, RegImmInst, StoreKind, shift>; + def : MVE_vector_store_typed<v4f32, RegImmInst, StoreKind, shift>; + def : MVE_vector_store_typed<v2i64, RegImmInst, StoreKind, shift>; + def : MVE_vector_store_typed<v2f64, RegImmInst, StoreKind, shift>; +} + +class MVE_vector_load_typed<ValueType Ty,
Instruction RegImmInst, + PatFrag LoadKind, int shift> + : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)), + (Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>; +class MVE_vector_maskedload_typed<ValueType Ty, Instruction RegImmInst, + PatFrag LoadKind, int shift> + : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))), + (Ty (RegImmInst t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred))>; + +multiclass MVE_vector_load<Instruction RegImmInst, PatFrag LoadKind, + int shift> { + def : MVE_vector_load_typed<v16i8, RegImmInst, LoadKind, shift>; + def : MVE_vector_load_typed<v8i16, RegImmInst, LoadKind, shift>; + def : MVE_vector_load_typed<v8f16, RegImmInst, LoadKind, shift>; + def : MVE_vector_load_typed<v4i32, RegImmInst, LoadKind, shift>; + def : MVE_vector_load_typed<v4f32, RegImmInst, LoadKind, shift>; + def : MVE_vector_load_typed<v2i64, RegImmInst, LoadKind, shift>; + def : MVE_vector_load_typed<v2f64, RegImmInst, LoadKind, shift>; +} + +class MVE_vector_offset_store_typed<ValueType Ty, Instruction Opcode, PatFrag StoreKind, int shift> - : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr), - (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr)>; + : Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<shift>:$addr), + (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<shift>:$addr)>; -multiclass MVE_unpred_vector_store<Instruction RegImmInst, PatFrag StoreKind, +multiclass MVE_vector_offset_store<Instruction RegImmInst, PatFrag StoreKind, int shift> { - def : MVE_unpred_vector_store_typed<v16i8, RegImmInst, StoreKind, shift>; - def : MVE_unpred_vector_store_typed<v8i16, RegImmInst, StoreKind, shift>; - def : MVE_unpred_vector_store_typed<v8f16, RegImmInst, StoreKind, shift>; - def : MVE_unpred_vector_store_typed<v4i32, RegImmInst, StoreKind, shift>; - def : MVE_unpred_vector_store_typed<v4f32, RegImmInst, StoreKind, shift>; - def : MVE_unpred_vector_store_typed<v2i64, RegImmInst, StoreKind, shift>; - def : MVE_unpred_vector_store_typed<v2f64, RegImmInst, StoreKind, shift>; + def : MVE_vector_offset_store_typed<v16i8, RegImmInst, StoreKind, shift>; + def : MVE_vector_offset_store_typed<v8i16, RegImmInst, StoreKind, shift>; + def : MVE_vector_offset_store_typed<v8f16, RegImmInst, StoreKind, shift>; + def : MVE_vector_offset_store_typed<v4i32, RegImmInst, StoreKind, shift>; + def : MVE_vector_offset_store_typed<v4f32, RegImmInst, StoreKind, shift>; + def : MVE_vector_offset_store_typed<v2i64, RegImmInst, StoreKind, shift>; + def : MVE_vector_offset_store_typed<v2f64, RegImmInst, StoreKind, shift>; } -class MVE_unpred_vector_load_typed<ValueType Ty, Instruction RegImmInst, - PatFrag LoadKind, int shift> - : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)), - (Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>; +def aligned32_pre_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), + (pre_store node:$val, node:$ptr, node:$offset), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 4; +}]>; +def aligned32_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), + (post_store node:$val, node:$ptr, node:$offset), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 4; +}]>; +def aligned16_pre_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), + (pre_store node:$val, node:$ptr, node:$offset), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 2; +}]>; +def aligned16_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), + (post_store node:$val, node:$ptr, node:$offset), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 2; 
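+  // (Editorial note, not part of this patch:) getAlignment() is in bytes, so
+  // these aligned16/aligned32 pre/post-indexed PatFrags only match stores
+  // aligned to at least the element size (2 or 4 bytes).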
+}]>; -multiclass MVE_unpred_vector_load<Instruction RegImmInst, PatFrag LoadKind, - int shift> { - def : MVE_unpred_vector_load_typed<v16i8, RegImmInst, LoadKind, shift>; - def : MVE_unpred_vector_load_typed<v8i16, RegImmInst, LoadKind, shift>; - def : MVE_unpred_vector_load_typed<v8f16, RegImmInst, LoadKind, shift>; - def : MVE_unpred_vector_load_typed<v4i32, RegImmInst, LoadKind, shift>; - def : MVE_unpred_vector_load_typed<v4f32, RegImmInst, LoadKind, shift>; - def : MVE_unpred_vector_load_typed<v2i64, RegImmInst, LoadKind, shift>; - def : MVE_unpred_vector_load_typed<v2f64, RegImmInst, LoadKind, shift>; -} + +def maskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (masked_ld node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast<MaskedLoadSDNode>(N); + return Ld->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def sextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ + return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD; +}]>; +def zextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ + return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD; +}]>; +def extmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast<MaskedLoadSDNode>(N); + EVT ScalarVT = Ld->getMemoryVT().getScalarType(); + return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD; +}]>; +def alignedmaskedload16: PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (masked_ld node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast<MaskedLoadSDNode>(N); + EVT ScalarVT = Ld->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && Ld->getAlignment() >= 2; +}]>; +def sextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ + return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD; +}]>; +def zextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ + return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD; +}]>; +def extmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast<MaskedLoadSDNode>(N); + EVT ScalarVT = Ld->getMemoryVT().getScalarType(); + return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD; +}]>; +def alignedmaskedload32: PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (masked_ld node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast<MaskedLoadSDNode>(N); + EVT ScalarVT = Ld->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && Ld->getAlignment() >= 4; +}]>; + +def maskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred), + (masked_st node:$val, node:$ptr, node:$pred), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def truncatingmaskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred), + (maskedstore8 node:$val, node:$ptr, node:$pred), [{ + return cast<MaskedStoreSDNode>(N)->isTruncatingStore(); +}]>; +def maskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred), + (masked_st node:$val, node:$ptr, node:$pred), [{ + auto *St = cast<MaskedStoreSDNode>(N); + EVT ScalarVT = 
+  EVT ScalarVT = St->getMemoryVT().getScalarType();
+  return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2;
+}]>;
+
+def truncatingmaskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                                      (maskedstore16 node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+}]>;
+def maskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                            (masked_st node:$val, node:$ptr, node:$pred), [{
+  auto *St = cast<MaskedStoreSDNode>(N);
+  EVT ScalarVT = St->getMemoryVT().getScalarType();
+  return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4;
+}]>;
 
 let Predicates = [HasMVEInt, IsLE] in {
-  defm : MVE_unpred_vector_store<MVE_VSTRBU8, byte_alignedstore, 0>;
-  defm : MVE_unpred_vector_store<MVE_VSTRHU16, hword_alignedstore, 1>;
-  defm : MVE_unpred_vector_store<MVE_VSTRWU32, alignedstore32, 2>;
+  // Stores
+  defm : MVE_vector_store<MVE_VSTRBU8, byte_alignedstore, 0>;
+  defm : MVE_vector_store<MVE_VSTRHU16, hword_alignedstore, 1>;
+  defm : MVE_vector_store<MVE_VSTRWU32, alignedstore32, 2>;
 
-  defm : MVE_unpred_vector_load<MVE_VLDRBU8, byte_alignedload, 0>;
-  defm : MVE_unpred_vector_load<MVE_VLDRHU16, hword_alignedload, 1>;
-  defm : MVE_unpred_vector_load<MVE_VLDRWU32, alignedload32, 2>;
+  // Loads
+  defm : MVE_vector_load<MVE_VLDRBU8, byte_alignedload, 0>;
+  defm : MVE_vector_load<MVE_VLDRHU16, hword_alignedload, 1>;
+  defm : MVE_vector_load<MVE_VLDRWU32, alignedload32, 2>;
 
-  def : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)),
-            (v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
-  def : Pat<(v8i1 (load t2addrmode_imm7<2>:$addr)),
-            (v8i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
-  def : Pat<(v4i1 (load t2addrmode_imm7<2>:$addr)),
-            (v4i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
+  // Pre/post inc stores
+  defm : MVE_vector_offset_store<MVE_VSTRBU8_pre, pre_store, 0>;
+  defm : MVE_vector_offset_store<MVE_VSTRBU8_post, post_store, 0>;
+  defm : MVE_vector_offset_store<MVE_VSTRHU16_pre, aligned16_pre_store, 1>;
+  defm : MVE_vector_offset_store<MVE_VSTRHU16_post, aligned16_post_store, 1>;
+  defm : MVE_vector_offset_store<MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
+  defm : MVE_vector_offset_store<MVE_VSTRWU32_post, aligned32_post_store, 2>;
 }
 
 let Predicates = [HasMVEInt, IsBE] in {
-  def : MVE_unpred_vector_store_typed<v16i8, MVE_VSTRBU8, store, 0>;
-  def : MVE_unpred_vector_store_typed<v8i16, MVE_VSTRHU16, alignedstore16, 1>;
-  def : MVE_unpred_vector_store_typed<v8f16, MVE_VSTRHU16, alignedstore16, 1>;
-  def : MVE_unpred_vector_store_typed<v4i32, MVE_VSTRWU32, alignedstore32, 2>;
-  def : MVE_unpred_vector_store_typed<v4f32, MVE_VSTRWU32, alignedstore32, 2>;
+  // Aligned Stores
+  def : MVE_vector_store_typed<v16i8, MVE_VSTRBU8, store, 0>;
+  def : MVE_vector_store_typed<v8i16, MVE_VSTRHU16, alignedstore16, 1>;
+  def : MVE_vector_store_typed<v8f16, MVE_VSTRHU16, alignedstore16, 1>;
+  def : MVE_vector_store_typed<v4i32, MVE_VSTRWU32, alignedstore32, 2>;
+  def : MVE_vector_store_typed<v4f32, MVE_VSTRWU32, alignedstore32, 2>;
 
-  def : MVE_unpred_vector_load_typed<v16i8, MVE_VLDRBU8, load, 0>;
-  def : MVE_unpred_vector_load_typed<v8i16, MVE_VLDRHU16, alignedload16, 1>;
-  def : MVE_unpred_vector_load_typed<v8f16, MVE_VLDRHU16, alignedload16, 1>;
-  def : MVE_unpred_vector_load_typed<v4i32, MVE_VLDRWU32, alignedload32, 2>;
-  def : MVE_unpred_vector_load_typed<v4f32, MVE_VLDRWU32, alignedload32, 2>;
+  // Aligned Loads
+  def : MVE_vector_load_typed<v16i8, MVE_VLDRBU8, load, 0>;
+  def : MVE_vector_load_typed<v8i16, MVE_VLDRHU16, alignedload16, 1>;
+  def : MVE_vector_load_typed<v8f16, MVE_VLDRHU16, alignedload16, 1>;
+  def : MVE_vector_load_typed<v4i32, MVE_VLDRWU32, alignedload32, 2>;
+  def : MVE_vector_load_typed<v4f32, MVE_VLDRWU32, alignedload32, 2>;
+
+  // Other unaligned loads/stores need to go though a VREV
+  def : Pat<(v2f64 (load t2addrmode_imm7<0>:$addr)),
+            (v2f64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+  def : Pat<(v2i64 (load t2addrmode_imm7<0>:$addr)),
+            (v2i64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+  def : Pat<(v4i32 (load t2addrmode_imm7<0>:$addr)),
+            (v4i32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+  def : Pat<(v4f32 (load t2addrmode_imm7<0>:$addr)),
+            (v4f32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+  def : Pat<(v8i16 (load t2addrmode_imm7<0>:$addr)),
+            (v8i16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+  def : Pat<(v8f16 (load t2addrmode_imm7<0>:$addr)),
+            (v8f16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+  def : Pat<(store (v2f64 MQPR:$val), t2addrmode_imm7<0>:$addr),
+            (MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+  def : Pat<(store (v2i64 MQPR:$val), t2addrmode_imm7<0>:$addr),
+            (MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+  def : Pat<(store (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr),
+            (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+  def : Pat<(store (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr),
+            (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+  def : Pat<(store (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr),
+            (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+  def : Pat<(store (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr),
+            (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+
+  // Pre/Post inc stores
+  def : MVE_vector_offset_store_typed<v16i8, MVE_VSTRBU8_pre, pre_store, 0>;
+  def : MVE_vector_offset_store_typed<v16i8, MVE_VSTRBU8_post, post_store, 0>;
+  def : MVE_vector_offset_store_typed<v8i16, MVE_VSTRHU16_pre, aligned16_pre_store, 1>;
+  def : MVE_vector_offset_store_typed<v8i16, MVE_VSTRHU16_post, aligned16_post_store, 1>;
+  def : MVE_vector_offset_store_typed<v8f16, MVE_VSTRHU16_pre, aligned16_pre_store, 1>;
+  def : MVE_vector_offset_store_typed<v8f16, MVE_VSTRHU16_post, aligned16_post_store, 1>;
+  def : MVE_vector_offset_store_typed<v4i32, MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
+  def : MVE_vector_offset_store_typed<v4i32, MVE_VSTRWU32_post, aligned32_post_store, 2>;
+  def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
+  def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_post, aligned32_post_store, 2>;
 }
 
+let Predicates = [HasMVEInt] in {
+  // Aligned masked store, shared between LE and BE
+  def : MVE_vector_maskedstore_typed<v16i8, MVE_VSTRBU8, maskedstore8, 0>;
+  def : MVE_vector_maskedstore_typed<v8i16, MVE_VSTRHU16, maskedstore16, 1>;
+  def : MVE_vector_maskedstore_typed<v8f16, MVE_VSTRHU16, maskedstore16, 1>;
+  def : MVE_vector_maskedstore_typed<v4i32, MVE_VSTRWU32, maskedstore32, 2>;
+  def : MVE_vector_maskedstore_typed<v4f32, MVE_VSTRWU32, maskedstore32, 2>;
+  // Truncating stores
+  def : Pat<(truncatingmaskedstore8 (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  def : Pat<(truncatingmaskedstore8 (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  def : Pat<(truncatingmaskedstore16 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr, VCCR:$pred),
+            (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred)>;
+  // Aligned masked loads
+  def : MVE_vector_maskedload_typed<v16i8, MVE_VLDRBU8, maskedload8, 0>;
+  def : MVE_vector_maskedload_typed<v8i16, MVE_VLDRHU16, alignedmaskedload16, 1>;
+  def : MVE_vector_maskedload_typed<v8f16, MVE_VLDRHU16, alignedmaskedload16, 1>;
+  def : MVE_vector_maskedload_typed<v4i32, MVE_VLDRWU32, alignedmaskedload32, 2>;
+  def : MVE_vector_maskedload_typed<v4f32, MVE_VLDRWU32, alignedmaskedload32, 2>;
+  // Extending masked loads.
+  def : Pat<(v8i16 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+                                    (v8i16 NEONimmAllZerosV))),
+            (v8i16 (MVE_VLDRBS16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v4i32 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+                                    (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VLDRBS32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v8i16 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+                                    (v8i16 NEONimmAllZerosV))),
+            (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v4i32 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+                                    (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v8i16 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+                                   (v8i16 NEONimmAllZerosV))),
+            (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v4i32 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+                                   (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v4i32 (sextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
+                                     (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VLDRHS32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v4i32 (zextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
+                                     (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v4i32 (extmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
+                                    (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
+}
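A note on the masked load/store PatFrags added in this hunk: each one keys on the MaskedLoad/StoreSDNode's scalar memory type, its alignment, and (for loads) the extension kind, which is how a generic llvm.masked.load with an all-zeros passthru gets steered onto the right predicated VLDRB/VLDRH/VLDRW form. A minimal C++ sketch of that legality check follows; the helper name and enum are hypothetical stand-ins, not LLVM API:

#include <cstdint>

// Sketch of the predicate logic shared by the masked PatFrags above:
// i8 accesses are always acceptable; i16/f16 need 2-byte alignment;
// i32/f32 need 4-byte alignment. (Hypothetical helper, not LLVM API.)
enum class ScalarKind { I8, I16, F16, I32, F32 };

bool isSelectableMVEMaskedAccess(ScalarKind Kind, uint64_t AlignBytes) {
  switch (Kind) {
  case ScalarKind::I8:
    return true;            // maskedload8 / maskedstore8
  case ScalarKind::I16:
  case ScalarKind::F16:
    return AlignBytes >= 2; // alignedmaskedload16 / maskedstore16
  case ScalarKind::I32:
  case ScalarKind::F32:
    return AlignBytes >= 4; // alignedmaskedload32 / maskedstore32
  }
  return false;
}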
 
 // Widening/Narrowing Loads/Stores
 
+let MinAlignment = 2 in {
+  def truncstorevi16_align2 : PatFrag<(ops node:$val, node:$ptr),
+                                      (truncstorevi16 node:$val, node:$ptr)>;
+  def post_truncstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset),
+                                        (post_truncstvi16 node:$val, node:$base, node:$offset)>;
+  def pre_truncstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset),
+                                       (pre_truncstvi16 node:$val, node:$base, node:$offset)>;
+}
+
 let Predicates = [HasMVEInt] in {
-  def : Pat<(truncstorevi8 (v8i16 MQPR:$val), t2addrmode_imm7<1>:$addr),
-            (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<1>:$addr)>;
-  def : Pat<(truncstorevi8 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr),
-            (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<1>:$addr)>;
-  def : Pat<(truncstorevi16 (v4i32 MQPR:$val), t2addrmode_imm7<2>:$addr),
-            (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<2>:$addr)>;
+  def : Pat<(truncstorevi8 (v8i16 MQPR:$val), taddrmode_imm7<0>:$addr),
+            (MVE_VSTRB16 MQPR:$val, taddrmode_imm7<0>:$addr)>;
+  def : Pat<(truncstorevi8 (v4i32 MQPR:$val), taddrmode_imm7<0>:$addr),
+            (MVE_VSTRB32 MQPR:$val, taddrmode_imm7<0>:$addr)>;
+  def : Pat<(truncstorevi16_align2 (v4i32 MQPR:$val), taddrmode_imm7<1>:$addr),
+            (MVE_VSTRH32 MQPR:$val, taddrmode_imm7<1>:$addr)>;
+
+  def : Pat<(post_truncstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
+            (MVE_VSTRB16_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
+  def : Pat<(post_truncstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
+            (MVE_VSTRB32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
+  def : Pat<(post_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr),
+            (MVE_VSTRH32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr)>;
+
+  def : Pat<(pre_truncstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
+            (MVE_VSTRB16_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
+  def : Pat<(pre_truncstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
+            (MVE_VSTRB32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
+  def : Pat<(pre_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr),
+            (MVE_VSTRH32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr)>;
+}
+
+
+let MinAlignment = 2 in {
+  def extloadvi16_align2 : PatFrag<(ops node:$ptr), (extloadvi16 node:$ptr)>;
+  def sextloadvi16_align2 : PatFrag<(ops node:$ptr), (sextloadvi16 node:$ptr)>;
+  def zextloadvi16_align2 : PatFrag<(ops node:$ptr), (zextloadvi16 node:$ptr)>;
 }
 
 multiclass MVEExtLoad<string DestLanes, string DestElemBits,
                       string SrcElemBits, string SrcElemType,
-                      Operand am> {
+                      string Align, Operand am> {
   def _Any : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits)
-                  (!cast<PatFrag>("extloadvi" # SrcElemBits) am:$addr)),
+                  (!cast<PatFrag>("extloadvi" # SrcElemBits # Align) am:$addr)),
                  (!cast<Instruction>("MVE_VLDR" # SrcElemType # "U" # DestElemBits) am:$addr)>;
   def _Z : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits)
-                (!cast<PatFrag>("zextloadvi" # SrcElemBits) am:$addr)),
+                (!cast<PatFrag>("zextloadvi" # SrcElemBits # Align) am:$addr)),
               (!cast<Instruction>("MVE_VLDR" # SrcElemType # "U" # DestElemBits) am:$addr)>;
   def _S : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits)
-                (!cast<PatFrag>("sextloadvi" # SrcElemBits) am:$addr)),
+                (!cast<PatFrag>("sextloadvi" # SrcElemBits # Align) am:$addr)),
              (!cast<Instruction>("MVE_VLDR" # SrcElemType # "S" # DestElemBits) am:$addr)>;
 }
 
 let Predicates = [HasMVEInt] in {
-  defm : MVEExtLoad<"4", "32", "8",  "B", t2addrmode_imm7<1>>;
-  defm : MVEExtLoad<"8", "16", "8",  "B", t2addrmode_imm7<1>>;
-  defm : MVEExtLoad<"4", "32", "16", "H", t2addrmode_imm7<2>>;
+  defm : MVEExtLoad<"4", "32", "8",  "B", "", taddrmode_imm7<0>>;
+  defm : MVEExtLoad<"8", "16", "8",  "B", "", taddrmode_imm7<0>>;
+  defm : MVEExtLoad<"4", "32", "16", "H", "_align2", taddrmode_imm7<1>>;
 }
 
 // Bit convert patterns
 let Predicates = [HasMVEInt] in {
-  def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
-  def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v2i64 MQPR:$src))), (v2f64 MQPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v2f64 MQPR:$src))), (v2i64 MQPR:$src)>;
 
-  def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>;
-  def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v4f32 MQPR:$src))), (v4i32 MQPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v4i32 MQPR:$src))), (v4f32 MQPR:$src)>;
 
-  def : Pat<(v8i16 (bitconvert (v8f16 QPR:$src))), (v8i16 QPR:$src)>;
-  def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (v8f16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v8f16 MQPR:$src))), (v8i16 MQPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v8i16 MQPR:$src))), (v8f16 MQPR:$src)>;
 }
 
 let Predicates = [IsLE,HasMVEInt] in {
-  def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
-  def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
-  def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>;
-  def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
-  def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4f32 MQPR:$src))), (v2f64 MQPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4i32 MQPR:$src))), (v2f64 MQPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8f16 MQPR:$src))), (v2f64 MQPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8i16 MQPR:$src))), (v2f64 MQPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v16i8 MQPR:$src))), (v2f64 MQPR:$src)>;
+
+  def : Pat<(v2i64 (bitconvert (v4f32 MQPR:$src))), (v2i64 MQPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v4i32 MQPR:$src))), (v2i64 MQPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8f16 MQPR:$src))), (v2i64 MQPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8i16 MQPR:$src))), (v2i64 MQPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v16i8 MQPR:$src))), (v2i64 MQPR:$src)>;
+
+  def : Pat<(v4f32 (bitconvert (v2f64 MQPR:$src))), (v4f32 MQPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2i64 MQPR:$src))), (v4f32 MQPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8f16 MQPR:$src))), (v4f32 MQPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8i16 MQPR:$src))), (v4f32 MQPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v16i8 MQPR:$src))), (v4f32 MQPR:$src)>;
+
+  def : Pat<(v4i32 (bitconvert (v2f64 MQPR:$src))), (v4i32 MQPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2i64 MQPR:$src))), (v4i32 MQPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8f16 MQPR:$src))), (v4i32 MQPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8i16 MQPR:$src))), (v4i32 MQPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v16i8 MQPR:$src))), (v4i32 MQPR:$src)>;
+
+  def : Pat<(v8f16 (bitconvert (v2f64 MQPR:$src))), (v8f16 MQPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v2i64 MQPR:$src))), (v8f16 MQPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v4f32 MQPR:$src))), (v8f16 MQPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v4i32 MQPR:$src))), (v8f16 MQPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v16i8 MQPR:$src))), (v8f16 MQPR:$src)>;
+
+  def : Pat<(v8i16 (bitconvert (v2f64 MQPR:$src))), (v8i16 MQPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2i64 MQPR:$src))), (v8i16 MQPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4f32 MQPR:$src))), (v8i16 MQPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4i32 MQPR:$src))), (v8i16 MQPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v16i8 MQPR:$src))), (v8i16 MQPR:$src)>;
+
+  def : Pat<(v16i8 (bitconvert (v2f64 MQPR:$src))), (v16i8 MQPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2i64 MQPR:$src))), (v16i8 MQPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4f32 MQPR:$src))), (v16i8 MQPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4i32 MQPR:$src))), (v16i8 MQPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8f16 MQPR:$src))), (v16i8 MQPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8i16 MQPR:$src))), (v16i8 MQPR:$src)>;
+}
+
+let Predicates = [IsBE,HasMVEInt] in {
+  def : Pat<(v2f64 (bitconvert (v4f32 MQPR:$src))), (v2f64 (MVE_VREV64_32 MQPR:$src))>;
+  def : Pat<(v2f64 (bitconvert (v4i32 MQPR:$src))), (v2f64 (MVE_VREV64_32 MQPR:$src))>;
+  def : Pat<(v2f64 (bitconvert (v8f16 MQPR:$src))), (v2f64 (MVE_VREV64_16 MQPR:$src))>;
+  def : Pat<(v2f64 (bitconvert (v8i16 MQPR:$src))), (v2f64 (MVE_VREV64_16 MQPR:$src))>;
+  def : Pat<(v2f64 (bitconvert (v16i8 MQPR:$src))), (v2f64 (MVE_VREV64_8 MQPR:$src))>;
 
-  def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
-  def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>;
-  def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (v2i64 QPR:$src)>;
-  def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>;
-  def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v4f32 MQPR:$src))), (v2i64 (MVE_VREV64_32 MQPR:$src))>;
+  def : Pat<(v2i64 (bitconvert (v4i32 MQPR:$src))), (v2i64 (MVE_VREV64_32 MQPR:$src))>;
+  def : Pat<(v2i64 (bitconvert (v8f16 MQPR:$src))), (v2i64 (MVE_VREV64_16 MQPR:$src))>;
+  def : Pat<(v2i64 (bitconvert (v8i16 MQPR:$src))), (v2i64 (MVE_VREV64_16 MQPR:$src))>;
+  def : Pat<(v2i64 (bitconvert (v16i8 MQPR:$src))), (v2i64 (MVE_VREV64_8 MQPR:$src))>;
 
-  def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
-  def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
-  def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (v4f32 QPR:$src)>;
-  def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
-  def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2f64 MQPR:$src))), (v4f32 (MVE_VREV64_32 MQPR:$src))>;
+  def : Pat<(v4f32 (bitconvert (v2i64 MQPR:$src))), (v4f32 (MVE_VREV64_32 MQPR:$src))>;
+  def : Pat<(v4f32 (bitconvert (v8f16 MQPR:$src))), (v4f32 (MVE_VREV32_16 MQPR:$src))>;
+  def : Pat<(v4f32 (bitconvert (v8i16 MQPR:$src))), (v4f32 (MVE_VREV32_16 MQPR:$src))>;
+  def : Pat<(v4f32 (bitconvert (v16i8 MQPR:$src))), (v4f32 (MVE_VREV32_8 MQPR:$src))>;
 
-  def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
-  def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>;
-  def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (v4i32 QPR:$src)>;
-  def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>;
-  def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2f64 MQPR:$src))), (v4i32 (MVE_VREV64_32 MQPR:$src))>;
+  def : Pat<(v4i32 (bitconvert (v2i64 MQPR:$src))), (v4i32 (MVE_VREV64_32 MQPR:$src))>;
+  def : Pat<(v4i32 (bitconvert (v8f16 MQPR:$src))), (v4i32 (MVE_VREV32_16 MQPR:$src))>;
+  def : Pat<(v4i32 (bitconvert (v8i16 MQPR:$src))), (v4i32 (MVE_VREV32_16 MQPR:$src))>;
+  def : Pat<(v4i32 (bitconvert (v16i8 MQPR:$src))), (v4i32 (MVE_VREV32_8 MQPR:$src))>;
 
-  def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>;
-  def : Pat<(v8f16 (bitconvert (v2i64 QPR:$src))), (v8f16 QPR:$src)>;
-  def : Pat<(v8f16 (bitconvert (v4f32 QPR:$src))), (v8f16 QPR:$src)>;
-  def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (v8f16 QPR:$src)>;
-  def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (v8f16 QPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v2f64 MQPR:$src))), (v8f16 (MVE_VREV64_16 MQPR:$src))>;
+  def : Pat<(v8f16 (bitconvert (v2i64 MQPR:$src))), (v8f16 (MVE_VREV64_16 MQPR:$src))>;
+  def : Pat<(v8f16 (bitconvert (v4f32 MQPR:$src))), (v8f16 (MVE_VREV32_16 MQPR:$src))>;
+  def : Pat<(v8f16 (bitconvert (v4i32 MQPR:$src))), (v8f16 (MVE_VREV32_16 MQPR:$src))>;
+  def : Pat<(v8f16 (bitconvert (v16i8 MQPR:$src))), (v8f16 (MVE_VREV16_8 MQPR:$src))>;
 
-  def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
-  def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>;
-  def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
-  def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>;
-  def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2f64 MQPR:$src))), (v8i16 (MVE_VREV64_16 MQPR:$src))>;
+  def : Pat<(v8i16 (bitconvert (v2i64 MQPR:$src))), (v8i16 (MVE_VREV64_16 MQPR:$src))>;
+  def : Pat<(v8i16 (bitconvert (v4f32 MQPR:$src))), (v8i16 (MVE_VREV32_16 MQPR:$src))>;
+  def : Pat<(v8i16 (bitconvert (v4i32 MQPR:$src))), (v8i16 (MVE_VREV32_16 MQPR:$src))>;
+  def : Pat<(v8i16 (bitconvert (v16i8 MQPR:$src))), (v8i16 (MVE_VREV16_8 MQPR:$src))>;
 
-  def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
-  def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
-  def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
-  def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
-  def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (v16i8 QPR:$src)>;
-  def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2f64 MQPR:$src))), (v16i8 (MVE_VREV64_8 MQPR:$src))>;
+  def : Pat<(v16i8 (bitconvert (v2i64 MQPR:$src))), (v16i8 (MVE_VREV64_8 MQPR:$src))>;
+  def : Pat<(v16i8 (bitconvert (v4f32 MQPR:$src))), (v16i8 (MVE_VREV32_8 MQPR:$src))>;
+  def : Pat<(v16i8 (bitconvert (v4i32 MQPR:$src))), (v16i8 (MVE_VREV32_8 MQPR:$src))>;
+  def : Pat<(v16i8 (bitconvert (v8f16 MQPR:$src))), (v16i8 (MVE_VREV16_8 MQPR:$src))>;
+  def : Pat<(v16i8 (bitconvert (v8i16 MQPR:$src))), (v16i8 (MVE_VREV16_8 MQPR:$src))>;
 }
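For readers unfamiliar with the VREV patterns in the big-endian block above: on BE targets a bitcast between vector types of different element sizes is not a register-level no-op, so each pattern inserts the VREV whose granularity matches the wider element. A rough C++ model of what MVE_VREV64_32 does to a 128-bit Q register, illustrative only, with a plain array standing in for the register:

#include <algorithm>
#include <cstdint>

// Illustrative model of VREV64.32: reverse the order of the 32-bit
// elements within each 64-bit half of a 128-bit Q register.
void vrev64_32(uint32_t Lane[4]) {
  std::swap(Lane[0], Lane[1]); // low doubleword
  std::swap(Lane[2], Lane[3]); // high doubleword
}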
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 806681df102c..60ca92e58041 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -15,22 +15,22 @@
 // NEON-specific Operands.
 //===----------------------------------------------------------------------===//
 def nModImm : Operand<i32> {
-  let PrintMethod = "printNEONModImmOperand";
+  let PrintMethod = "printVMOVModImmOperand";
 }
 def nImmSplatI8AsmOperand : AsmOperandClass { let Name = "NEONi8splat"; }
 def nImmSplatI8 : Operand<i32> {
-  let PrintMethod = "printNEONModImmOperand";
+  let PrintMethod = "printVMOVModImmOperand";
   let ParserMatchClass = nImmSplatI8AsmOperand;
 }
 def nImmSplatI16AsmOperand : AsmOperandClass { let Name = "NEONi16splat"; }
 def nImmSplatI16 : Operand<i32> {
-  let PrintMethod = "printNEONModImmOperand";
+  let PrintMethod = "printVMOVModImmOperand";
   let ParserMatchClass = nImmSplatI16AsmOperand;
 }
 def nImmSplatI32AsmOperand : AsmOperandClass { let Name = "NEONi32splat"; }
 def nImmSplatI32 : Operand<i32> {
-  let PrintMethod = "printNEONModImmOperand";
+  let PrintMethod = "printVMOVModImmOperand";
   let ParserMatchClass = nImmSplatI32AsmOperand;
 }
 def nImmSplatNotI16AsmOperand : AsmOperandClass { let Name = "NEONi16splatNot"; }
@@ -43,7 +43,7 @@ def nImmSplatNotI32 : Operand<i32> {
 }
 def nImmVMOVI32AsmOperand : AsmOperandClass { let Name = "NEONi32vmov"; }
 def nImmVMOVI32 : Operand<i32> {
-  let PrintMethod = "printNEONModImmOperand";
+  let PrintMethod = "printVMOVModImmOperand";
   let ParserMatchClass = nImmVMOVI32AsmOperand;
 }
@@ -62,18 +62,18 @@ class nImmVINVIAsmOperandReplicate<ValueType From, ValueType To>
 }
 class nImmVMOVIReplicate<ValueType From, ValueType To> : Operand<i32> {
-  let PrintMethod = "printNEONModImmOperand";
+  let PrintMethod = "printVMOVModImmOperand";
   let ParserMatchClass = nImmVMOVIAsmOperandReplicate<From, To>;
 }
 class nImmVINVIReplicate<ValueType From, ValueType To> : Operand<i32> {
-  let PrintMethod = "printNEONModImmOperand";
+  let PrintMethod = "printVMOVModImmOperand";
   let ParserMatchClass = nImmVINVIAsmOperandReplicate<From, To>;
 }
 def nImmVMOVI32NegAsmOperand : AsmOperandClass { let Name = "NEONi32vmovNeg"; }
 def nImmVMOVI32Neg : Operand<i32> {
-  let PrintMethod = "printNEONModImmOperand";
+  let PrintMethod = "printVMOVModImmOperand";
   let ParserMatchClass = nImmVMOVI32NegAsmOperand;
 }
 def nImmVMOVF32 : Operand<i32> {
@@ -82,7 +82,7 @@ def nImmVMOVF32 : Operand<i32> {
 }
 def nImmSplatI64AsmOperand : AsmOperandClass { let Name = "NEONi64splat"; }
 def nImmSplatI64 : Operand<i32> {
-  let PrintMethod = "printNEONModImmOperand";
+  let PrintMethod = "printVMOVModImmOperand";
   let ParserMatchClass = nImmSplatI64AsmOperand;
 }
@@ -478,20 +478,8 @@ def non_word_alignedstore : PatFrag<(ops node:$val, node:$ptr),
 // NEON-specific DAG Nodes.
 //===----------------------------------------------------------------------===//
-def SDTARMVCMP  : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>;
-def SDTARMVCMPZ : SDTypeProfile<1, 1, []>;
-
-def NEONvceq  : SDNode<"ARMISD::VCEQ", SDTARMVCMP>;
-def NEONvceqz : SDNode<"ARMISD::VCEQZ", SDTARMVCMPZ>;
-def NEONvcge  : SDNode<"ARMISD::VCGE", SDTARMVCMP>;
-def NEONvcgez : SDNode<"ARMISD::VCGEZ", SDTARMVCMPZ>;
-def NEONvclez : SDNode<"ARMISD::VCLEZ", SDTARMVCMPZ>;
-def NEONvcgeu : SDNode<"ARMISD::VCGEU", SDTARMVCMP>;
-def NEONvcgt  : SDNode<"ARMISD::VCGT", SDTARMVCMP>;
-def NEONvcgtz : SDNode<"ARMISD::VCGTZ", SDTARMVCMPZ>;
-def NEONvcltz : SDNode<"ARMISD::VCLTZ", SDTARMVCMPZ>;
-def NEONvcgtu : SDNode<"ARMISD::VCGTU", SDTARMVCMP>;
-def NEONvtst  : SDNode<"ARMISD::VTST", SDTARMVCMP>;
+def SDTARMVTST : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>;
+def NEONvtst   : SDNode<"ARMISD::VTST", SDTARMVTST>;
 
 // Types for vector shift by immediates.  The "SHX" version is for long and
 // narrow operations where the source and destination vectors have different
@@ -559,14 +547,14 @@ def NEONvtbl2 : SDNode<"ARMISD::VTBL2", SDTARMVTBL2>;
 def NEONimmAllZerosV: PatLeaf<(ARMvmovImm (i32 timm)), [{
   ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
   unsigned EltBits = 0;
-  uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits);
+  uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits);
   return (EltBits == 32 && EltVal == 0);
 }]>;
 
 def NEONimmAllOnesV: PatLeaf<(ARMvmovImm (i32 timm)), [{
   ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
   unsigned EltBits = 0;
-  uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits);
+  uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits);
   return (EltBits == 8 && EltVal == 0xff);
 }]>;
@@ -3326,30 +3314,30 @@ class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
 // source operand element sizes of 8, 16 and 32 bits:
 multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
                        bits<5> op11_7, bit op4, string opc, string Dt,
-                       string asm, SDNode OpNode> {
+                       string asm, int fc> {
   // 64-bit vector types.
   def v8i8  : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 0, op4,
                   (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
                   opc, !strconcat(Dt, "8"), asm, "",
-                  [(set DPR:$Vd, (v8i8 (OpNode (v8i8 DPR:$Vm))))]>;
+                  [(set DPR:$Vd, (v8i8 (ARMvcmpz (v8i8 DPR:$Vm), (i32 fc))))]>;
   def v4i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4,
                   (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
                   opc, !strconcat(Dt, "16"), asm, "",
-                  [(set DPR:$Vd, (v4i16 (OpNode (v4i16 DPR:$Vm))))]>;
+                  [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4i16 DPR:$Vm), (i32 fc))))]>;
   def v2i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4,
                   (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
                   opc, !strconcat(Dt, "32"), asm, "",
-                  [(set DPR:$Vd, (v2i32 (OpNode (v2i32 DPR:$Vm))))]>;
+                  [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2i32 DPR:$Vm), (i32 fc))))]>;
   def v2f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4,
                   (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
                   opc, "f32", asm, "",
-                  [(set DPR:$Vd, (v2i32 (OpNode (v2f32 DPR:$Vm))))]> {
+                  [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2f32 DPR:$Vm), (i32 fc))))]> {
     let Inst{10} = 1; // overwrite F = 1
   }
   def v4f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4,
                   (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
                   opc, "f16", asm, "",
-                  [(set DPR:$Vd, (v4i16 (OpNode (v4f16 DPR:$Vm))))]>,
+                  [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4f16 DPR:$Vm), (i32 fc))))]>,
               Requires<[HasNEON,HasFullFP16]> {
     let Inst{10} = 1; // overwrite F = 1
   }
@@ -3358,30 +3346,83 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
   def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4,
                   (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
                   opc, !strconcat(Dt, "8"), asm, "",
-                  [(set QPR:$Vd, (v16i8 (OpNode (v16i8 QPR:$Vm))))]>;
+                  [(set QPR:$Vd, (v16i8 (ARMvcmpz (v16i8 QPR:$Vm), (i32 fc))))]>;
   def v8i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4,
                   (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
                   opc, !strconcat(Dt, "16"), asm, "",
-                  [(set QPR:$Vd, (v8i16 (OpNode (v8i16 QPR:$Vm))))]>;
+                  [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8i16 QPR:$Vm), (i32 fc))))]>;
   def v4i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4,
                   (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
                   opc, !strconcat(Dt, "32"), asm, "",
-                  [(set QPR:$Vd, (v4i32 (OpNode (v4i32 QPR:$Vm))))]>;
+                  [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4i32 QPR:$Vm), (i32 fc))))]>;
   def v4f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4,
                   (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
                   opc, "f32", asm, "",
-                  [(set QPR:$Vd, (v4i32 (OpNode (v4f32 QPR:$Vm))))]> {
+                  [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4f32 QPR:$Vm), (i32 fc))))]> {
    let Inst{10} = 1; // overwrite F = 1
   }
   def v8f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4,
                   (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
                   opc, "f16", asm, "",
-                  [(set QPR:$Vd, (v8i16 (OpNode (v8f16 QPR:$Vm))))]>,
+                  [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8f16 QPR:$Vm), (i32 fc))))]>,
               Requires<[HasNEON,HasFullFP16]> {
     let Inst{10} = 1; // overwrite F = 1
   }
 }
 
+// Neon 3-register comparisons.
+class N3VQ_cmp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+               InstrItinClass itin, string OpcodeStr, string Dt,
+               ValueType ResTy, ValueType OpTy, int fc, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (ResTy (ARMvcmp (OpTy QPR:$Vn), (OpTy QPR:$Vm), (i32 fc))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+
+class N3VD_cmp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+               InstrItinClass itin, string OpcodeStr, string Dt,
+               ValueType ResTy, ValueType OpTy, int fc, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set DPR:$Vd, (ResTy (ARMvcmp (OpTy DPR:$Vn), (OpTy DPR:$Vm), (i32 fc))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+
+multiclass N3V_QHS_cmp<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itinD16, InstrItinClass itinD32,
+                       InstrItinClass itinQ16, InstrItinClass itinQ32,
+                       string OpcodeStr, string Dt,
+                       int fc, bit Commutable = 0> {
+  // 64-bit vector types.
+  def v8i8  : N3VD_cmp<op24, op23, 0b00, op11_8, op4, itinD16,
+                       OpcodeStr, !strconcat(Dt, "8"),
+                       v8i8, v8i8, fc, Commutable>;
+  def v4i16 : N3VD_cmp<op24, op23, 0b01, op11_8, op4, itinD16,
+                       OpcodeStr, !strconcat(Dt, "16"),
+                       v4i16, v4i16, fc, Commutable>;
+  def v2i32 : N3VD_cmp<op24, op23, 0b10, op11_8, op4, itinD32,
+                       OpcodeStr, !strconcat(Dt, "32"),
+                       v2i32, v2i32, fc, Commutable>;
+
+  // 128-bit vector types.
+  def v16i8 : N3VQ_cmp<op24, op23, 0b00, op11_8, op4, itinQ16,
+                       OpcodeStr, !strconcat(Dt, "8"),
+                       v16i8, v16i8, fc, Commutable>;
+  def v8i16 : N3VQ_cmp<op24, op23, 0b01, op11_8, op4, itinQ16,
+                       OpcodeStr, !strconcat(Dt, "16"),
+                       v8i16, v8i16, fc, Commutable>;
+  def v4i32 : N3VQ_cmp<op24, op23, 0b10, op11_8, op4, itinQ32,
+                       OpcodeStr, !strconcat(Dt, "32"),
+                       v4i32, v4i32, fc, Commutable>;
+}
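The small `fc` integers threaded through N3VD_cmp/N3VQ_cmp (and N2V_QHS_cmp above) line up with ARMCC::CondCodes, so a single ARMvcmp/ARMvcmpz node plus an immediate replaces the old per-condition SDNodes. For reference, a trimmed C++ sketch covering the values used in the comparison patterns below; the full enumeration lives in ARMBaseInfo.h and defines all sixteen codes:

// Trimmed sketch of ARMCC::CondCodes for the fc values used here:
enum CondCode {
  EQ = 0,  // vceq
  HS = 2,  // vcge.u  (unsigned >=)
  HI = 8,  // vcgt.u  (unsigned >)
  GE = 10, // vcge.s / vcge.f
  LT = 11, // vclt
  GT = 12, // vcgt.s / vcgt.f
  LE = 13, // vcle
};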
+
 // Neon 2-register vector intrinsics,
 //   element sizes of 8, 16 and 32 bits:
@@ -5026,67 +5067,67 @@ def : Pat<(v2i32 (trunc (ARMvshruImm (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))),
 // Vector Comparisons.
 
 // VCEQ : Vector Compare Equal
-defm VCEQ  : N3V_QHS<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
-                     IIC_VSUBi4Q, "vceq", "i", NEONvceq, 1>;
-def  VCEQfd : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32,
-                   NEONvceq, 1>;
-def  VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32,
-                   NEONvceq, 1>;
-def  VCEQhd : N3VD<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16,
-                   NEONvceq, 1>,
+defm VCEQ  : N3V_QHS_cmp<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                         IIC_VSUBi4Q, "vceq", "i", 0, 1>;
+def  VCEQfd : N3VD_cmp<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32,
+                       0, 1>;
+def  VCEQfq : N3VQ_cmp<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32,
+                       0, 1>;
+def  VCEQhd : N3VD_cmp<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16,
+                       0, 1>,
               Requires<[HasNEON, HasFullFP16]>;
-def  VCEQhq : N3VQ<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16,
-                   NEONvceq, 1>,
+def  VCEQhq : N3VQ_cmp<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16,
+                       0, 1>,
              Requires<[HasNEON, HasFullFP16]>;
 
 let TwoOperandAliasConstraint = "$Vm = $Vd" in
 defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i",
-                         "$Vd, $Vm, #0", NEONvceqz>;
+                         "$Vd, $Vm, #0", 0>;
 
 // VCGE : Vector Compare Greater Than or Equal
-defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
-                     IIC_VSUBi4Q, "vcge", "s", NEONvcge, 0>;
-defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
-                     IIC_VSUBi4Q, "vcge", "u", NEONvcgeu, 0>;
-def  VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32,
-                   NEONvcge, 0>;
-def  VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32,
-                   NEONvcge, 0>;
-def  VCGEhd : N3VD<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16,
-                   NEONvcge, 0>,
+defm VCGEs : N3V_QHS_cmp<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                         IIC_VSUBi4Q, "vcge", "s", 10, 0>;
+defm VCGEu : N3V_QHS_cmp<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                         IIC_VSUBi4Q, "vcge", "u", 2, 0>;
+def  VCGEfd : N3VD_cmp<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32,
+                       10, 0>;
+def  VCGEfq : N3VQ_cmp<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32,
+                       10, 0>;
+def  VCGEhd : N3VD_cmp<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16,
+                       10, 0>,
              Requires<[HasNEON, HasFullFP16]>;
-def  VCGEhq : N3VQ<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16,
-                   NEONvcge, 0>,
+def  VCGEhq : N3VQ_cmp<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16,
+                       10, 0>,
              Requires<[HasNEON, HasFullFP16]>;
 
 let TwoOperandAliasConstraint = "$Vm = $Vd" in {
 defm VCGEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s",
-                         "$Vd, $Vm, #0", NEONvcgez>;
+                         "$Vd, $Vm, #0", 10>;
 defm VCLEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s",
-                         "$Vd, $Vm, #0", NEONvclez>;
+                         "$Vd, $Vm, #0", 13>;
 }
 
 // VCGT : Vector Compare Greater Than
-defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
-                     IIC_VSUBi4Q, "vcgt", "s", NEONvcgt, 0>;
-defm VCGTu : N3V_QHS<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
-                     IIC_VSUBi4Q, "vcgt", "u", NEONvcgtu, 0>;
-def  VCGTfd : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32,
-                   NEONvcgt, 0>;
-def  VCGTfq : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32,
-                   NEONvcgt, 0>;
-def  VCGThd : N3VD<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16,
-                   NEONvcgt, 0>,
+defm VCGTs : N3V_QHS_cmp<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                         IIC_VSUBi4Q, "vcgt", "s", 12, 0>;
+defm VCGTu : N3V_QHS_cmp<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                         IIC_VSUBi4Q, "vcgt", "u", 8, 0>;
+def  VCGTfd : N3VD_cmp<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32,
+                       12, 0>;
+def  VCGTfq : N3VQ_cmp<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32,
+                       12, 0>;
+def  VCGThd : N3VD_cmp<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16,
+                       12, 0>,
              Requires<[HasNEON, HasFullFP16]>;
-def  VCGThq : N3VQ<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16,
-                   NEONvcgt, 0>,
+def  VCGThq : N3VQ_cmp<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16,
+                       12, 0>,
              Requires<[HasNEON, HasFullFP16]>;
 
 let TwoOperandAliasConstraint = "$Vm = $Vd" in {
 defm VCGTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s",
-                         "$Vd, $Vm, #0", NEONvcgtz>;
+                         "$Vd, $Vm, #0", 12>;
 defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s",
-                         "$Vd, $Vm, #0", NEONvcltz>;
+                         "$Vd, $Vm, #0", 11>;
 }
 
 // VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE)
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index cfeb13c6acb6..18bcbda44580 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -565,6 +565,13 @@ let isCall = 1,
                   4, IIC_Br,
                   [(ARMcall_nolink tGPR:$func)]>,
             Requires<[IsThumb, IsThumb1Only]>, Sched<[WriteBr]>;
+
+  // Also used for Thumb2
+  // push lr before the call
+  def tBL_PUSHLR : tPseudoInst<(outs), (ins GPRlr:$ra, pred:$p, thumb_bl_target:$func),
+                   4, IIC_Br,
+                   []>,
+                   Requires<[IsThumb]>, Sched<[WriteBr]>;
 }
 
 let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
@@ -592,6 +599,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
                      [(ARMbrjt tGPR:$target, tjumptable:$jt)]>,
                 Sched<[WriteBrTbl]> {
     let Size = 2;
+    let isNotDuplicable = 1;
    list<Predicate> Predicates = [IsThumb, IsThumb1Only];
   }
 }
@@ -1362,6 +1370,12 @@ let hasPostISelHook = 1, Defs = [CPSR] in {
                       [(set tGPR:$Rd, CPSR, (ARMsubc 0, tGPR:$Rn))]>,
                       Requires<[IsThumb1Only]>,
                       Sched<[WriteALU]>;
+
+  def tLSLSri : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn, imm0_31:$imm5),
+                      2, IIC_iALUr,
+                      [(set tGPR:$Rd, CPSR, (ARMlsls tGPR:$Rn, imm0_31:$imm5))]>,
+                      Requires<[IsThumb1Only]>,
+                      Sched<[WriteALU]>;
 }
 
@@ -1465,7 +1479,7 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
 // Thumb-1 doesn't have the TBB or TBH instructions, but we can synthesize them
 // and make use of the same compressed jump table format as Thumb-2.
 let Size = 2, isBranch = 1, isTerminator = 1, isBarrier = 1,
-    isIndirectBranch = 1 in {
+    isIndirectBranch = 1, isNotDuplicable = 1 in {
   def tTBB_JT : tPseudoInst<(outs),
       (ins tGPRwithpc:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0,
       IIC_Br, []>, Sched<[WriteBr]>;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 7cbfaba7a8eb..25a45b39fa0c 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -45,7 +45,8 @@ def mve_shift_imm : AsmOperandClass {
   let RenderMethod = "addImmOperands";
   let DiagnosticString = "operand must be an immediate in the range [1,32]";
 }
-def long_shift : Operand<i32> {
+def long_shift : Operand<i32>,
+                 ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]> {
   let ParserMatchClass = mve_shift_imm;
   let DecoderMethod = "DecodeLongShiftOperand";
 }
@@ -2394,6 +2395,23 @@ def : Thumb2DSPPat<(int_arm_qadd(int_arm_qadd rGPR:$Rm, rGPR:$Rm), rGPR:$Rn),
 def : Thumb2DSPPat<(int_arm_qsub rGPR:$Rm, (int_arm_qadd rGPR:$Rn, rGPR:$Rn)),
                    (t2QDSUB rGPR:$Rm, rGPR:$Rn)>;
 
+def : Thumb2DSPPat<(saddsat rGPR:$Rm, rGPR:$Rn),
+                   (t2QADD rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ssubsat rGPR:$Rm, rGPR:$Rn),
+                   (t2QSUB rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(saddsat(saddsat rGPR:$Rm, rGPR:$Rm), rGPR:$Rn),
+                   (t2QDADD rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ssubsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)),
+                   (t2QDSUB rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ARMqadd8b rGPR:$Rm, rGPR:$Rn),
+                   (t2QADD8 rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ARMqsub8b rGPR:$Rm, rGPR:$Rn),
+                   (t2QSUB8 rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ARMqadd16b rGPR:$Rm, rGPR:$Rn),
+                   (t2QADD16 rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ARMqsub16b rGPR:$Rm, rGPR:$Rn),
+                   (t2QSUB16 rGPR:$Rm, rGPR:$Rn)>;
+
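The saddsat/ssubsat patterns just added let the generic saturating ISD nodes select directly to Thumb-2 QADD/QSUB instead of going through the int_arm_qadd intrinsics. As a reminder of the semantics being matched, a scalar C++ model of 32-bit saturating addition (subtraction is analogous):

#include <cstdint>
#include <limits>

// Scalar model of what (saddsat i32, i32) computes: the sum clamps to
// [INT32_MIN, INT32_MAX] instead of wrapping, which is the behaviour
// of the QADD instruction the pattern selects.
int32_t qadd32(int32_t A, int32_t B) {
  int64_t Wide = static_cast<int64_t>(A) + B;
  if (Wide > std::numeric_limits<int32_t>::max())
    return std::numeric_limits<int32_t>::max();
  if (Wide < std::numeric_limits<int32_t>::min())
    return std::numeric_limits<int32_t>::min();
  return static_cast<int32_t>(Wide);
}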
 // Signed/Unsigned add/subtract
 
 def t2SASX : T2I_pam_intrinsics<0b010, 0b0000, "sasx", int_arm_sasx>;
@@ -4085,7 +4103,7 @@ def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp),
 
 // Pseudo isntruction that combines movs + predicated rsbmi
 // to implement integer ABS
-let usesCustomInserter = 1, Defs = [CPSR] in {
+let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in {
 def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src),
                        NoItinerary, []>, Requires<[IsThumb2]>;
 }
@@ -4175,15 +4193,15 @@ multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm, list<dag>
 }
 
 let DecoderNamespace = "Thumb2CoProc" in {
-defm t2LDC   : t2LdStCop<0b1110, 1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm t2LDCL  : t2LdStCop<0b1110, 1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm t2LDC2  : t2LdStCop<0b1111, 1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
-defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2LDC   : t2LdStCop<0b1110, 1, 0, "ldc", [(int_arm_ldc timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm t2LDCL  : t2LdStCop<0b1110, 1, 1, "ldcl", [(int_arm_ldcl timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm t2LDC2  : t2LdStCop<0b1111, 1, 0, "ldc2", [(int_arm_ldc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l", [(int_arm_ldc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
 
-defm t2STC   : t2LdStCop<0b1110, 0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm t2STCL  : t2LdStCop<0b1110, 0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm t2STC2  : t2LdStCop<0b1111, 0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
-defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2STC   : t2LdStCop<0b1110, 0, 0, "stc", [(int_arm_stc timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm t2STCL  : t2LdStCop<0b1110, 0, 1, "stcl", [(int_arm_stcl timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm t2STC2  : t2LdStCop<0b1111, 0, 0, "stc2", [(int_arm_stc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l", [(int_arm_stc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
 }
 
@@ -4368,8 +4386,8 @@ def t2MCR : t2MovRCopro<0b1110, "mcr", 0,
             (outs),
             (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
                  c_imm:$CRm, imm0_7:$opc2),
-            [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
-                          imm:$CRm, imm:$opc2)]>,
+            [(int_arm_mcr timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn,
+                          timm:$CRm, timm:$opc2)]>,
             ComplexDeprecationPredicate<"MCR">;
 def : t2InstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
                   (t2MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
@@ -4377,8 +4395,8 @@ def : t2InstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
 def t2MCR2 : t2MovRCopro<0b1111, "mcr2", 0,
              (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
                           c_imm:$CRm, imm0_7:$opc2),
-             [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
-                            imm:$CRm, imm:$opc2)]> {
+             [(int_arm_mcr2 timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn,
+                            timm:$CRm, timm:$opc2)]> {
   let Predicates = [IsThumb2, PreV8];
 }
 def : t2InstAlias<"mcr2${p} $cop, $opc1, $Rt, $CRn, $CRm",
@@ -4402,24 +4420,24 @@ def : t2InstAlias<"mrc2${p} $cop, $opc1, $Rt, $CRn, $CRm",
                   (t2MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
                           c_imm:$CRm, 0, pred:$p)>;
 
-def : T2v6Pat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
-              (t2MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+def : T2v6Pat<(int_arm_mrc timm:$cop, timm:$opc1, timm:$CRn, timm:$CRm, timm:$opc2),
+              (t2MRC p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>;
 
-def : T2v6Pat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
-              (t2MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+def : T2v6Pat<(int_arm_mrc2 timm:$cop, timm:$opc1, timm:$CRn, timm:$CRm, timm:$opc2),
+              (t2MRC2 p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>;
 
 /* from ARM core register to coprocessor */
 def t2MCRR : t2MovRRCopro<0b1110, "mcrr", 0, (outs),
                           (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2,
                                c_imm:$CRm),
-                          [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2,
-                                         imm:$CRm)]>;
+                          [(int_arm_mcrr timm:$cop, timm:$opc1, GPR:$Rt, GPR:$Rt2,
+                                         timm:$CRm)]>;
 def t2MCRR2 : t2MovRRCopro<0b1111, "mcrr2", 0, (outs),
                            (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2,
                                 c_imm:$CRm),
-                           [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt,
-                                           GPR:$Rt2, imm:$CRm)]> {
+                           [(int_arm_mcrr2 timm:$cop, timm:$opc1, GPR:$Rt,
+                                           GPR:$Rt2, timm:$CRm)]> {
   let Predicates = [IsThumb2, PreV8];
 }
 
@@ -4439,8 +4457,8 @@ def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1, (outs GPR:$Rt, GPR:$Rt2),
 def t2CDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
                   c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
                   "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
-                  [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
-                                imm:$CRm, imm:$opc2)]> {
+                  [(int_arm_cdp timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn,
+                                timm:$CRm, timm:$opc2)]> {
   let Inst{27-24} = 0b1110;
 
   bits<4> opc1;
@@ -4465,8 +4483,8 @@ def t2CDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
 def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1,
                    c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
                    "cdp2", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
-                   [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
-                                  imm:$CRm, imm:$opc2)]> {
+                   [(int_arm_cdp2 timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn,
+                                  timm:$CRm, timm:$opc2)]> {
   let Inst{27-24} = 0b1110;
 
   bits<4> opc1;
@@ -5087,6 +5105,7 @@ def t2BF_LabelPseudo
     : t2PseudoInst<(outs ), (ins pclabel:$cp), 0, NoItinerary, []> {
   let isTerminator = 1;
   let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB];
+  let hasNoSchedulingInfo = 1;
 }
 
 def t2BFi : t2BF<(ins bflabel_u4:$b_label, bflabel_s16:$label, pred:$p),
@@ -5217,11 +5236,13 @@ def t2LoopDec :
   t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size),
                4, IIC_Br, []>, Sched<[WriteBr]>;
 
-let isBranch = 1, isTerminator = 1, hasSideEffects = 1 in {
+let isBranch = 1, isTerminator = 1, hasSideEffects = 1, Defs = [CPSR] in {
+// Set WhileLoopStart and LoopEnd to occupy 8 bytes because they may
+// get converted into t2CMP and t2Bcc.
 def t2WhileLoopStart :
     t2PseudoInst<(outs),
                  (ins rGPR:$elts, brtarget:$target),
-                 4, IIC_Br, []>,
+                 8, IIC_Br, []>,
                  Sched<[WriteBr]>;
 
 def t2LoopEnd :
@@ -5233,7 +5254,7 @@ def t2LoopEnd :
 } // end isNotDuplicable
 
 class CS<string iname, bits<4> opcode, list<dag> pattern=[]>
-  : V8_1MI<(outs rGPR:$Rd), (ins GPRwithZR:$Rn, GPRwithZRnosp:$Rm, pred_noal:$fcond),
+  : V8_1MI<(outs rGPR:$Rd), (ins GPRwithZRnosp:$Rn, GPRwithZRnosp:$Rm, pred_noal:$fcond),
           AddrModeNone, NoItinerary, iname, "$Rd, $Rn, $Rm, $fcond", "", pattern> {
   bits<4> Rd;
   bits<4> Rm;
@@ -5255,6 +5276,25 @@ def t2CSINC : CS<"csinc", 0b1001>;
 def t2CSINV : CS<"csinv", 0b1010>;
 def t2CSNEG : CS<"csneg", 0b1011>;
 
+let Predicates = [HasV8_1MMainline] in {
+  def : T2Pat<(ARMcsinc GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm),
+              (t2CSINC GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>;
+  def : T2Pat<(ARMcsinv GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm),
+              (t2CSINV GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>;
+  def : T2Pat<(ARMcsneg GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm),
+              (t2CSNEG GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>;
+
+  multiclass ModifiedV8_1CSEL<Instruction Insn, dag modvalue> {
+    def : T2Pat<(ARMcmov modvalue, GPRwithZR:$tval, cmovpred:$imm),
+                (Insn GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>;
+    def : T2Pat<(ARMcmov GPRwithZR:$tval, modvalue, cmovpred:$imm),
+                (Insn GPRwithZR:$tval, GPRwithZR:$fval,
+                      (i32 (inv_cond_XFORM imm:$imm)))>;
+  }
+  defm : ModifiedV8_1CSEL<t2CSINC, (add rGPR:$fval, 1)>;
+  defm : ModifiedV8_1CSEL<t2CSINV, (xor rGPR:$fval, -1)>;
+  defm : ModifiedV8_1CSEL<t2CSNEG, (sub 0, rGPR:$fval)>;
+}
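The ModifiedV8_1CSEL patterns just added recognise an ARMcmov whose one operand is fval+1, ~fval or -fval and fold the arithmetic into a single conditional-select instruction. Their scalar semantics, sketched in C++ per the Armv8.1-M definitions, with `cond` standing for the tested condition:

#include <cstdint>

// Scalar semantics of the three conditional selects targeted above:
uint32_t csinc(bool cond, uint32_t t, uint32_t f) { return cond ? t : f + 1; }
uint32_t csinv(bool cond, uint32_t t, uint32_t f) { return cond ? t : ~f; }
uint32_t csneg(bool cond, uint32_t t, uint32_t f) { return cond ? t : 0u - f; }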
 
 // CS aliases.
 let Predicates = [HasV8_1MMainline] in {
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index a0dd25de07ee..fdd961bfbb2f 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -10,7 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-def SDT_CMPFP0  : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisVT<1, i32>]>;
+def SDT_CMPFP0  : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
 def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
                                        SDTCisSameAs<1, 2>]>;
 def SDT_VMOVRRD : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>,
@@ -19,7 +19,7 @@ def SDT_VMOVRRD : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>,
 def SDT_VMOVSR  : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i32>]>;
 
 def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInGlue, SDNPOutGlue]>;
-def arm_cmpfp  : SDNode<"ARMISD::CMPFP", SDT_ARMFCmp, [SDNPOutGlue]>;
+def arm_cmpfp  : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutGlue]>;
 def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>;
 def arm_fmdrr  : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>;
 def arm_fmrrd  : SDNode<"ARMISD::VMOVRRD", SDT_VMOVRRD>;
@@ -324,7 +324,7 @@ defm : VFPDTAnyInstAlias<"vpop${p}", "$r",
 // However, there is no UAL syntax for them, so we keep them around for
 // (dis)assembly only.
 multiclass vfp_ldstx_mult<string asm, bit L_bit> {
-  let Predicates = [HasFPRegs] in {
+  let Predicates = [HasFPRegs], hasNoSchedulingInfo = 1 in {
   // Unknown precision
   def XIA :
     AXXI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
@@ -548,12 +548,12 @@ let Defs = [FPSCR_NZCV] in {
 def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0,
                   (outs), (ins DPR:$Dd, DPR:$Dm),
                   IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm",
-                  [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm), (i32 1))]>;
+                  [/* For disassembly only; pattern left blank */]>;
 
 def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
                   (outs), (ins SPR:$Sd, SPR:$Sm),
                   IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm",
-                  [(arm_cmpfp SPR:$Sd, SPR:$Sm, (i32 1))]> {
+                  [/* For disassembly only; pattern left blank */]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
   let D = VFPNeonA8Domain;
 }
 
@@ -562,17 +562,17 @@ def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
 def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0,
                   (outs), (ins HPR:$Sd, HPR:$Sm),
                   IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm",
-                  [(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 1))]>;
+                  [/* For disassembly only; pattern left blank */]>;
 
 def VCMPD  : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
                   (outs), (ins DPR:$Dd, DPR:$Dm),
                   IIC_fpCMP64, "vcmp", ".f64\t$Dd, $Dm",
-                  [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm), (i32 0))]>;
+                  [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm))]>;
 
 def VCMPS  : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
                   (outs), (ins SPR:$Sd, SPR:$Sm),
                   IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm",
-                  [(arm_cmpfp SPR:$Sd, SPR:$Sm, (i32 0))]> {
+                  [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
   let D = VFPNeonA8Domain;
 }
 
@@ -581,7 +581,7 @@ def VCMPS  : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
 def VCMPH  : AHuI<0b11101, 0b11, 0b0100, 0b01, 0,
                   (outs), (ins HPR:$Sd, HPR:$Sm),
                   IIC_fpCMP16, "vcmp", ".f16\t$Sd, $Sm",
-                  [(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 0))]>;
+                  [(arm_cmpfp HPR:$Sd, HPR:$Sm)]>;
 } // Defs = [FPSCR_NZCV]
 
 //===----------------------------------------------------------------------===//
@@ -611,7 +611,7 @@ let Defs = [FPSCR_NZCV] in {
 def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
                    (outs), (ins DPR:$Dd),
                    IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0",
-                   [(arm_cmpfp0 (f64 DPR:$Dd), (i32 1))]> {
+                   [/* For disassembly only; pattern left blank */]> {
   let Inst{3-0} = 0b0000;
   let Inst{5}   = 0;
 }
@@ -619,7 +619,7 @@ def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
 def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
                    (outs), (ins SPR:$Sd),
                    IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0",
-                   [(arm_cmpfp0 SPR:$Sd, (i32 1))]> {
+                   [/* For disassembly only; pattern left blank */]> {
   let Inst{3-0} = 0b0000;
   let Inst{5}   = 0;
 
@@ -631,7 +631,7 @@ def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
 def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0,
                    (outs), (ins HPR:$Sd),
                    IIC_fpCMP16, "vcmpe", ".f16\t$Sd, #0",
-                   [(arm_cmpfp0 HPR:$Sd, (i32 1))]> {
+                   [/* For disassembly only; pattern left blank */]> {
   let Inst{3-0} = 0b0000;
   let Inst{5}   = 0;
 }
@@ -639,7 +639,7 @@ def VCMPZD  : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
                    (outs), (ins DPR:$Dd),
                    IIC_fpCMP64, "vcmp", ".f64\t$Dd, #0",
-                   [(arm_cmpfp0 (f64 DPR:$Dd), (i32 0))]> {
+                   [(arm_cmpfp0 (f64 DPR:$Dd))]> {
   let Inst{3-0} = 0b0000;
   let Inst{5}   = 0;
 }
@@ -647,7 +647,7 @@ def VCMPZS  : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
                    (outs), (ins SPR:$Sd),
                    IIC_fpCMP32, "vcmp", ".f32\t$Sd, #0",
-                   [(arm_cmpfp0 SPR:$Sd, (i32 0))]> {
+                   [(arm_cmpfp0 SPR:$Sd)]> {
   let Inst{3-0} = 0b0000;
   let Inst{5}   = 0;
 
@@ -659,7 +659,7 @@ def VCMPZS  : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
 def VCMPZH  : AHuI<0b11101, 0b11, 0b0101, 0b01, 0,
                    (outs), (ins HPR:$Sd),
                    IIC_fpCMP16, "vcmp", ".f16\t$Sd, #0",
-                   [(arm_cmpfp0 HPR:$Sd, (i32 0))]> {
+                   [(arm_cmpfp0 HPR:$Sd)]> {
   let Inst{3-0} = 0b0000;
   let Inst{5}   = 0;
 }
@@ -1732,7 +1732,8 @@ def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0,
 def VTOUHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 0,
                        (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
-                       IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", []> {
+                       IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", []>,
+             Sched<[WriteFPCVT]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
   let D = VFPNeonA8Domain;
@@ -1740,7 +1741,8 @@ def VTOSLS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 1,
                        (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
-                       IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", []> {
+                       IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", []>,
+             Sched<[WriteFPCVT]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
   let D = VFPNeonA8Domain;
@@ -1748,7 +1750,8 @@ def VTOULS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 1,
                        (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
-                       IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", []> {
+                       IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", []>,
+             Sched<[WriteFPCVT]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
   let D = VFPNeonA8Domain;
@@ -2297,6 +2300,8 @@ class MovFromVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
   let Inst{6-5}   = 0b00;
   let Inst{4}     = 1;
   let Inst{3-0}   = 0b0000;
+  let Unpredictable{7-5} = 0b111;
+  let Unpredictable{3-0} = 0b1111;
 }
 
 let DecoderMethod = "DecodeForVMRSandVMSR" in {
@@ -2370,63 +2375,65 @@ class MovToVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
     VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> {
 
   // Instruction operand.
-  bits<4> src;
-
-  // Encode instruction operand.
-  let Inst{15-12} = src;
+  bits<4> Rt;
 
   let Inst{27-20} = 0b11101110;
   let Inst{19-16} = opc19_16;
+  let Inst{15-12} = Rt;
   let Inst{11-8}  = 0b1010;
   let Inst{7}     = 0;
+  let Inst{6-5}   = 0b00;
   let Inst{4}     = 1;
+  let Inst{3-0}   = 0b0000;
 
   let Predicates = [HasVFP2];
+  let Unpredictable{7-5} = 0b111;
+  let Unpredictable{3-0} = 0b1111;
 }
 
 let DecoderMethod = "DecodeForVMRSandVMSR" in {
 let Defs = [FPSCR] in {
   let Predicates = [HasFPRegs] in
  // Application level GPR -> FPSCR
-  def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPRnopc:$src),
-                      "vmsr", "\tfpscr, $src",
-                      [(int_arm_set_fpscr GPRnopc:$src)]>;
+  def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPRnopc:$Rt),
+                      "vmsr", "\tfpscr, $Rt",
+                      [(int_arm_set_fpscr GPRnopc:$Rt)]>;
 
   // System level GPR -> FPEXC
-  def VMSR_FPEXC : MovToVFP<0b1000 /* fpexc */, (outs), (ins GPRnopc:$src),
-                            "vmsr", "\tfpexc, $src", []>;
+  def VMSR_FPEXC : MovToVFP<0b1000 /* fpexc */, (outs), (ins GPRnopc:$Rt),
+                            "vmsr", "\tfpexc, $Rt", []>;
 
   // System level GPR -> FPSID
-  def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPRnopc:$src),
-                            "vmsr", "\tfpsid, $src", []>;
-  def VMSR_FPINST : MovToVFP<0b1001 /* fpinst */, (outs), (ins GPRnopc:$src),
-                             "vmsr", "\tfpinst, $src", []>;
-  def VMSR_FPINST2 : MovToVFP<0b1010 /* fpinst2 */, (outs), (ins GPRnopc:$src),
-                              "vmsr", "\tfpinst2, $src", []>;
+  def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPRnopc:$Rt),
+                            "vmsr", "\tfpsid, $Rt", []>;
+  def VMSR_FPINST : MovToVFP<0b1001 /* fpinst */, (outs), (ins GPRnopc:$Rt),
+                             "vmsr", "\tfpinst, $Rt", []>;
+  def VMSR_FPINST2 : MovToVFP<0b1010 /* fpinst2 */, (outs), (ins GPRnopc:$Rt),
+                              "vmsr", "\tfpinst2, $Rt", []>;
 }
 
 let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
   // System level GPR -> FPSCR with context saving for security extensions
-  def VMSR_FPCXTNS : MovToVFP<0b1110 /* fpcxtns */, (outs), (ins GPR:$src),
-                              "vmsr", "\tfpcxtns, $src", []>;
+  def VMSR_FPCXTNS : MovToVFP<0b1110 /* fpcxtns */, (outs), (ins GPR:$Rt),
+                              "vmsr", "\tfpcxtns, $Rt", []>;
 }
 
 let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
   // System level GPR -> FPSCR with context saving for security extensions
-  def VMSR_FPCXTS : MovToVFP<0b1111 /* fpcxts */, (outs), (ins GPR:$src),
-                             "vmsr", "\tfpcxts, $src", []>;
+  def VMSR_FPCXTS : MovToVFP<0b1111 /* fpcxts */, (outs), (ins GPR:$Rt),
+                             "vmsr", "\tfpcxts, $Rt", []>;
 }
 
 let Predicates = [HasV8_1MMainline, HasFPRegs] in {
   // System level GPR -> FPSCR_NZCVQC
   def VMSR_FPSCR_NZCVQC : MovToVFP<0b0010 /* fpscr_nzcvqc */,
(ins GPR:$src), - "vmsr", "\tfpscr_nzcvqc, $src", []>; + (outs cl_FPSCR_NZCV:$fpscr_out), (ins GPR:$Rt), + "vmsr", "\tfpscr_nzcvqc, $Rt", []>; } let Predicates = [HasV8_1MMainline, HasMVEInt] in { // System level GPR -> VPR/P0 let Defs = [VPR] in - def VMSR_VPR : MovToVFP<0b1100 /* vpr */, (outs), (ins GPR:$src), - "vmsr", "\tvpr, $src", []>; + def VMSR_VPR : MovToVFP<0b1100 /* vpr */, (outs), (ins GPR:$Rt), + "vmsr", "\tvpr, $Rt", []>; - def VMSR_P0 : MovToVFP<0b1101 /* p0 */, (outs VCCR:$cond), (ins GPR:$src), - "vmsr", "\tp0, $src", []>; + def VMSR_P0 : MovToVFP<0b1101 /* p0 */, (outs VCCR:$cond), (ins GPR:$Rt), + "vmsr", "\tp0, $Rt", []>; } } @@ -2614,7 +2621,8 @@ def VSCCLRMD : VFPXI<(outs), (ins pred:$p, fp_dreglist_with_vpr:$regs, variable_ let Inst{21-16} = 0b011111; let Inst{15-12} = regs{11-8}; let Inst{11-8} = 0b1011; - let Inst{7-0} = regs{7-0}; + let Inst{7-1} = regs{7-1}; + let Inst{0} = 0; let DecoderMethod = "DecodeVSCCLRM"; diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 4485a474a6df..8e5e474c0f59 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -34,7 +34,7 @@ public: ARMInstructionSelector(const ARMBaseTargetMachine &TM, const ARMSubtarget &STI, const ARMRegisterBankInfo &RBI); - bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override; + bool select(MachineInstr &I) override; static const char *getName() { return DEBUG_TYPE; } private: @@ -210,8 +210,8 @@ static const TargetRegisterClass *guessRegClass(unsigned Reg, static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) { - unsigned DstReg = I.getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + Register DstReg = I.getOperand(0).getReg(); + if (Register::isPhysicalRegister(DstReg)) return true; const TargetRegisterClass *RC = guessRegClass(DstReg, MRI, TRI, RBI); @@ -236,17 +236,17 @@ static bool selectMergeValues(MachineInstrBuilder &MIB, // We only support G_MERGE_VALUES as a way to stick together two scalar GPRs // into one DPR. - unsigned VReg0 = MIB->getOperand(0).getReg(); + Register VReg0 = MIB->getOperand(0).getReg(); (void)VReg0; assert(MRI.getType(VReg0).getSizeInBits() == 64 && RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::FPRRegBankID && "Unsupported operand for G_MERGE_VALUES"); - unsigned VReg1 = MIB->getOperand(1).getReg(); + Register VReg1 = MIB->getOperand(1).getReg(); (void)VReg1; assert(MRI.getType(VReg1).getSizeInBits() == 32 && RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID && "Unsupported operand for G_MERGE_VALUES"); - unsigned VReg2 = MIB->getOperand(2).getReg(); + Register VReg2 = MIB->getOperand(2).getReg(); (void)VReg2; assert(MRI.getType(VReg2).getSizeInBits() == 32 && RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::GPRRegBankID && @@ -268,17 +268,17 @@ static bool selectUnmergeValues(MachineInstrBuilder &MIB, // We only support G_UNMERGE_VALUES as a way to break up one DPR into two // GPRs. 
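For readers tracking these selectors: G_MERGE_VALUES here glues two 32-bit GPR scalars into one 64-bit FPR-bank value, and G_UNMERGE_VALUES is its inverse. A minimal standalone sketch of the value semantics being modelled, assuming little-endian half ordering (`mergeValues`/`unmergeValues` are illustrative names, not LLVM API):

```cpp
#include <cassert>
#include <cstdint>

// G_MERGE_VALUES as used here: two 32-bit scalars become the halves of
// one 64-bit value, as when two GPRs are packed into a DPR.
uint64_t mergeValues(uint32_t lo, uint32_t hi) {
  return (static_cast<uint64_t>(hi) << 32) | lo;
}

// G_UNMERGE_VALUES: the inverse split back into two 32-bit scalars.
void unmergeValues(uint64_t v, uint32_t &lo, uint32_t &hi) {
  lo = static_cast<uint32_t>(v);
  hi = static_cast<uint32_t>(v >> 32);
}

int main() {
  uint32_t lo = 0, hi = 0;
  unmergeValues(mergeValues(0xdeadbeefu, 0xcafef00du), lo, hi);
  assert(lo == 0xdeadbeef && hi == 0xcafef00d);
  return 0;
}
```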
- unsigned VReg0 = MIB->getOperand(0).getReg(); + Register VReg0 = MIB->getOperand(0).getReg(); (void)VReg0; assert(MRI.getType(VReg0).getSizeInBits() == 32 && RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::GPRRegBankID && "Unsupported operand for G_UNMERGE_VALUES"); - unsigned VReg1 = MIB->getOperand(1).getReg(); + Register VReg1 = MIB->getOperand(1).getReg(); (void)VReg1; assert(MRI.getType(VReg1).getSizeInBits() == 32 && RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID && "Unsupported operand for G_UNMERGE_VALUES"); - unsigned VReg2 = MIB->getOperand(2).getReg(); + Register VReg2 = MIB->getOperand(2).getReg(); (void)VReg2; assert(MRI.getType(VReg2).getSizeInBits() == 64 && RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::FPRRegBankID && @@ -833,8 +833,7 @@ void ARMInstructionSelector::renderVFPF64Imm( NewInstBuilder.addImm(FPImmEncoding); } -bool ARMInstructionSelector::select(MachineInstr &I, - CodeGenCoverage &CoverageInfo) const { +bool ARMInstructionSelector::select(MachineInstr &I) { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -851,7 +850,7 @@ bool ARMInstructionSelector::select(MachineInstr &I, using namespace TargetOpcode; - if (selectImpl(I, CoverageInfo)) + if (selectImpl(I, *CoverageInfo)) return true; MachineInstrBuilder MIB{MF, I}; @@ -874,10 +873,10 @@ bool ARMInstructionSelector::select(MachineInstr &I, MIB.addImm(1).add(predOps(ARMCC::AL)).add(condCodeOp()); if (isSExt) { - unsigned SExtResult = I.getOperand(0).getReg(); + Register SExtResult = I.getOperand(0).getReg(); // Use a new virtual register for the result of the AND - unsigned AndResult = MRI.createVirtualRegister(&ARM::GPRRegClass); + Register AndResult = MRI.createVirtualRegister(&ARM::GPRRegClass); I.getOperand(0).setReg(AndResult); auto InsertBefore = std::next(I.getIterator()); @@ -928,7 +927,7 @@ bool ARMInstructionSelector::select(MachineInstr &I, assert(MRI.getType(SrcReg).getSizeInBits() == 64 && "Unsupported size"); assert(MRI.getType(DstReg).getSizeInBits() <= 32 && "Unsupported size"); - unsigned IgnoredBits = MRI.createVirtualRegister(&ARM::GPRRegClass); + Register IgnoredBits = MRI.createVirtualRegister(&ARM::GPRRegClass); auto InsertBefore = std::next(I.getIterator()); auto MovI = BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(ARM::VMOVRRD)) @@ -1039,7 +1038,7 @@ bool ARMInstructionSelector::select(MachineInstr &I, case G_FCMP: { assert(STI.hasVFP2Base() && "Can't select fcmp without VFP"); - unsigned OpReg = I.getOperand(2).getReg(); + Register OpReg = I.getOperand(2).getReg(); unsigned Size = MRI.getType(OpReg).getSizeInBits(); if (Size == 64 && !STI.hasFP64()) { @@ -1077,12 +1076,12 @@ bool ARMInstructionSelector::select(MachineInstr &I, case G_STORE: case G_LOAD: { const auto &MemOp = **I.memoperands_begin(); - if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) { + if (MemOp.isAtomic()) { LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n"); return false; } - unsigned Reg = I.getOperand(0).getReg(); + Register Reg = I.getOperand(0).getReg(); unsigned RegBank = RBI.getRegBank(Reg, MRI, TRI)->getID(); LLT ValTy = MRI.getType(Reg); @@ -1097,9 +1096,9 @@ bool ARMInstructionSelector::select(MachineInstr &I, if (ValSize == 1 && NewOpc == Opcodes.STORE8) { // Before storing a 1-bit value, make sure to clear out any unneeded bits. 
- unsigned OriginalValue = I.getOperand(0).getReg(); + Register OriginalValue = I.getOperand(0).getReg(); - unsigned ValueToStore = MRI.createVirtualRegister(&ARM::GPRRegClass); + Register ValueToStore = MRI.createVirtualRegister(&ARM::GPRRegClass); I.getOperand(0).setReg(ValueToStore); auto InsertBefore = I.getIterator(); @@ -1159,7 +1158,7 @@ bool ARMInstructionSelector::select(MachineInstr &I, case G_PHI: { I.setDesc(TII.get(PHI)); - unsigned DstReg = I.getOperand(0).getReg(); + Register DstReg = I.getOperand(0).getReg(); const TargetRegisterClass *RC = guessRegClass(DstReg, MRI, TRI, RBI); if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) { break; diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index 73a57b297ad6..81414e6d76fe 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -84,6 +84,8 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) .legalForCartesianProduct({s8, s16, s32}, {s1, s8, s16}); + getActionDefinitionsBuilder(G_SEXT_INREG).lower(); + getActionDefinitionsBuilder({G_MUL, G_AND, G_OR, G_XOR}) .legalFor({s32}) .minScalar(0, s32); diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 90a1ce238c3f..4a193fed04a3 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -509,7 +509,7 @@ void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB, Offset = MO.getImm() - WordOffset * getImmScale(Opc); // If storing the base register, it needs to be reset first. - unsigned InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg(); + Register InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg(); if (Offset >= 0 && !(IsStore && InstrSrcReg == Base)) MO.setImm(Offset); @@ -859,7 +859,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { // Determine list of registers and list of implicit super-register defs. for (const MachineInstr *MI : Cand.Instrs) { const MachineOperand &MO = getLoadStoreRegOp(*MI); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); bool IsKill = MO.isKill(); if (IsKill) KilledRegs.insert(Reg); @@ -874,7 +874,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { if (!MO.isReg() || !MO.isDef() || MO.isDead()) continue; assert(MO.isImplicit()); - unsigned DefReg = MO.getReg(); + Register DefReg = MO.getReg(); if (is_contained(ImpDefs, DefReg)) continue; @@ -893,7 +893,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { iterator InsertBefore = std::next(iterator(LatestMI)); MachineBasicBlock &MBB = *LatestMI->getParent(); unsigned Offset = getMemoryOpOffset(*First); - unsigned Base = getLoadStoreBaseOp(*First).getReg(); + Register Base = getLoadStoreBaseOp(*First).getReg(); bool BaseKill = LatestMI->killsRegister(Base); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(*First, PredReg); @@ -1005,7 +1005,7 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { const MachineInstr *MI = MemOps[SIndex].MI; int Offset = MemOps[SIndex].Offset; const MachineOperand &PMO = getLoadStoreRegOp(*MI); - unsigned PReg = PMO.getReg(); + Register PReg = PMO.getReg(); unsigned PRegNum = PMO.isUndef() ? 
std::numeric_limits<unsigned>::max() : TRI->getEncodingValue(PReg); unsigned Latest = SIndex; @@ -1052,7 +1052,7 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { if (NewOffset != Offset + (int)Size) break; const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == ARM::SP || Reg == ARM::PC) break; if (Count == Limit) @@ -1261,7 +1261,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { if (isThumb1) return false; const MachineOperand &BaseOP = MI->getOperand(0); - unsigned Base = BaseOP.getReg(); + Register Base = BaseOP.getReg(); bool BaseKill = BaseOP.isKill(); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg); @@ -1387,7 +1387,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { // FIXME: Use LDM/STM with single register instead. if (isThumb1) return false; - unsigned Base = getLoadStoreBaseOp(*MI).getReg(); + Register Base = getLoadStoreBaseOp(*MI).getReg(); bool BaseKill = getLoadStoreBaseOp(*MI).isKill(); unsigned Opcode = MI->getOpcode(); DebugLoc DL = MI->getDebugLoc(); @@ -1512,7 +1512,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { // Behaviour for writeback is undefined if base register is the same as one // of the others. const MachineOperand &BaseOp = MI.getOperand(2); - unsigned Base = BaseOp.getReg(); + Register Base = BaseOp.getReg(); const MachineOperand &Reg0Op = MI.getOperand(0); const MachineOperand &Reg1Op = MI.getOperand(1); if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base) @@ -1655,9 +1655,9 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, return false; const MachineOperand &BaseOp = MI->getOperand(2); - unsigned BaseReg = BaseOp.getReg(); - unsigned EvenReg = MI->getOperand(0).getReg(); - unsigned OddReg = MI->getOperand(1).getReg(); + Register BaseReg = BaseOp.getReg(); + Register EvenReg = MI->getOperand(0).getReg(); + Register OddReg = MI->getOperand(1).getReg(); unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false); unsigned OddRegNum = TRI->getDwarfRegNum(OddReg, false); @@ -1783,8 +1783,8 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { if (isMemoryOp(*MBBI)) { unsigned Opcode = MBBI->getOpcode(); const MachineOperand &MO = MBBI->getOperand(0); - unsigned Reg = MO.getReg(); - unsigned Base = getLoadStoreBaseOp(*MBBI).getReg(); + Register Reg = MO.getReg(); + Register Base = getLoadStoreBaseOp(*MBBI).getReg(); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(*MBBI, PredReg); int Offset = getMemoryOpOffset(*MBBI); @@ -2121,7 +2121,7 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base, MachineOperand &MO = I->getOperand(j); if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (MO.isDef() && TRI->regsOverlap(Reg, Base)) return false; if (Reg != Base && !MemRegs.count(Reg)) @@ -2415,7 +2415,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { int Opc = MI.getOpcode(); bool isLd = isLoadSingle(Opc); - unsigned Base = MI.getOperand(1).getReg(); + Register Base = MI.getOperand(1).getReg(); int Offset = getMemoryOpOffset(MI); bool StopHere = false; auto FindBases = [&] (Base2InstMap &Base2Ops, BaseVec &Bases) { diff --git a/lib/Target/ARM/ARMLowOverheadLoops.cpp b/lib/Target/ARM/ARMLowOverheadLoops.cpp index cedf3bd3c74e..e1c5a9c3e223 100644 --- a/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -11,8 
+11,7 @@ /// The expectation is that the loop contains three pseudo instructions: /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop /// form should be in the preheader, whereas the while form should be in the -/// preheaders only predecessor. TODO: Could DoLoopStart get moved into the -/// pre-preheader? +/// preheaders only predecessor. /// - t2LoopDec - placed within in the loop body. /// - t2LoopEnd - the loop latch terminator. /// @@ -35,6 +34,7 @@ using namespace llvm; namespace { class ARMLowOverheadLoops : public MachineFunctionPass { + MachineFunction *MF = nullptr; const ARMBaseInstrInfo *TII = nullptr; MachineRegisterInfo *MRI = nullptr; std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr; @@ -52,17 +52,6 @@ namespace { bool runOnMachineFunction(MachineFunction &MF) override; - bool ProcessLoop(MachineLoop *ML); - - void RevertWhile(MachineInstr *MI) const; - - void RevertLoopDec(MachineInstr *MI) const; - - void RevertLoopEnd(MachineInstr *MI) const; - - void Expand(MachineLoop *ML, MachineInstr *Start, - MachineInstr *Dec, MachineInstr *End, bool Revert); - MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoVRegs); @@ -71,36 +60,156 @@ namespace { StringRef getPassName() const override { return ARM_LOW_OVERHEAD_LOOPS_NAME; } + + private: + bool ProcessLoop(MachineLoop *ML); + + MachineInstr * IsSafeToDefineLR(MachineInstr *MI); + + bool RevertNonLoops(); + + void RevertWhile(MachineInstr *MI) const; + + bool RevertLoopDec(MachineInstr *MI, bool AllowFlags = false) const; + + void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const; + + void Expand(MachineLoop *ML, MachineInstr *Start, + MachineInstr *InsertPt, MachineInstr *Dec, + MachineInstr *End, bool Revert); + }; } - + char ARMLowOverheadLoops::ID = 0; INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, false, false) -bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &MF) { - if (!static_cast<const ARMSubtarget&>(MF.getSubtarget()).hasLOB()) +bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { + const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget()); + if (!ST.hasLOB()) return false; - LLVM_DEBUG(dbgs() << "ARM Loops on " << MF.getName() << " ------------- \n"); + MF = &mf; + LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n"); auto &MLI = getAnalysis<MachineLoopInfo>(); - MRI = &MF.getRegInfo(); - TII = static_cast<const ARMBaseInstrInfo*>( - MF.getSubtarget().getInstrInfo()); - BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(MF)); + MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness); + MRI = &MF->getRegInfo(); + TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo()); + BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF)); BBUtils->computeAllBlockSizes(); - BBUtils->adjustBBOffsetsAfter(&MF.front()); + BBUtils->adjustBBOffsetsAfter(&MF->front()); bool Changed = false; for (auto ML : MLI) { if (!ML->getParentLoop()) Changed |= ProcessLoop(ML); } + Changed |= RevertNonLoops(); return Changed; } +static bool IsLoopStart(MachineInstr &MI) { + return MI.getOpcode() == ARM::t2DoLoopStart || + MI.getOpcode() == ARM::t2WhileLoopStart; +} + +template<typename T> +static MachineInstr* SearchForDef(MachineInstr *Begin, T End, unsigned Reg) { + for(auto &MI : make_range(T(Begin), End)) { + for (auto &MO : MI.operands()) { + if 
(!MO.isReg() || !MO.isDef() || MO.getReg() != Reg) + continue; + return &MI; + } + } + return nullptr; +} + +static MachineInstr* SearchForUse(MachineInstr *Begin, + MachineBasicBlock::iterator End, + unsigned Reg) { + for(auto &MI : make_range(MachineBasicBlock::iterator(Begin), End)) { + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg) + continue; + return &MI; + } + } + return nullptr; +} + +// Is it safe to define LR with DLS/WLS? +// LR can defined if it is the operand to start, because it's the same value, +// or if it's going to be equivalent to the operand to Start. +MachineInstr *ARMLowOverheadLoops::IsSafeToDefineLR(MachineInstr *Start) { + + auto IsMoveLR = [](MachineInstr *MI, unsigned Reg) { + return MI->getOpcode() == ARM::tMOVr && + MI->getOperand(0).getReg() == ARM::LR && + MI->getOperand(1).getReg() == Reg && + MI->getOperand(2).getImm() == ARMCC::AL; + }; + + MachineBasicBlock *MBB = Start->getParent(); + unsigned CountReg = Start->getOperand(0).getReg(); + // Walk forward and backward in the block to find the closest instructions + // that define LR. Then also filter them out if they're not a mov lr. + MachineInstr *PredLRDef = SearchForDef(Start, MBB->rend(), ARM::LR); + if (PredLRDef && !IsMoveLR(PredLRDef, CountReg)) + PredLRDef = nullptr; + + MachineInstr *SuccLRDef = SearchForDef(Start, MBB->end(), ARM::LR); + if (SuccLRDef && !IsMoveLR(SuccLRDef, CountReg)) + SuccLRDef = nullptr; + + // We've either found one, two or none mov lr instructions... Now figure out + // if they are performing the equilvant mov that the Start instruction will. + // Do this by scanning forward and backward to see if there's a def of the + // register holding the count value. If we find a suitable def, return it as + // the insert point. Later, if InsertPt != Start, then we can remove the + // redundant instruction. + if (SuccLRDef) { + MachineBasicBlock::iterator End(SuccLRDef); + if (!SearchForDef(Start, End, CountReg)) { + return SuccLRDef; + } else + SuccLRDef = nullptr; + } + if (PredLRDef) { + MachineBasicBlock::reverse_iterator End(PredLRDef); + if (!SearchForDef(Start, End, CountReg)) { + return PredLRDef; + } else + PredLRDef = nullptr; + } + + // We can define LR because LR already contains the same value. + if (Start->getOperand(0).getReg() == ARM::LR) + return Start; + + // We've found no suitable LR def and Start doesn't use LR directly. Can we + // just define LR anyway? + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + LivePhysRegs LiveRegs(*TRI); + LiveRegs.addLiveOuts(*MBB); + + // Not if we've haven't found a suitable mov and LR is live out. + if (LiveRegs.contains(ARM::LR)) + return nullptr; + + // If LR is not live out, we can insert the instruction if nothing else + // uses LR after it. + if (!SearchForUse(Start, MBB->end(), ARM::LR)) + return Start; + + LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find suitable insertion point for" + << " LR\n"); + return nullptr; +} + bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { bool Changed = false; @@ -111,15 +220,10 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML); - auto IsLoopStart = [](MachineInstr &MI) { - return MI.getOpcode() == ARM::t2DoLoopStart || - MI.getOpcode() == ARM::t2WhileLoopStart; - }; - // Search the given block for a loop start instruction. If one isn't found, // and there's only one predecessor block, search that one too. 
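These def/use scans exist because LR is special here: the low-overhead loop hardware keeps the trip count in LR. As a rough scalar model of what a t2DLS/t2LE pair executes (a sketch under that assumption; `lowOverheadLoop` is a hypothetical name, and a possibly-zero count would need the WLS form's entry test):

```cpp
#include <cstdint>

// Rough model of a dls/le low-overhead loop: LR carries the trip
// count, t2LoopDec decrements it, and LE branches back while it is
// non-zero. This is why finding a safe place to define LR matters.
template <typename Body>
void lowOverheadLoop(uint32_t tripCount, Body body) {
  uint32_t lr = tripCount;  // dls lr, rN (count moved into LR)
  do {
    body();                 // loop body
    lr -= 1;                // decrement folded into...
  } while (lr != 0);        // ...le, which branches back on non-zero
}

int main() {
  uint32_t sum = 0;
  lowOverheadLoop(10, [&] { sum += 1; });
  return sum == 10 ? 0 : 1;
}
```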
std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart = - [&IsLoopStart, &SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* { + [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* { for (auto &MI : *MBB) { if (IsLoopStart(MI)) return &MI; @@ -165,41 +269,62 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { Dec = &MI; else if (MI.getOpcode() == ARM::t2LoopEnd) End = &MI; - else if (MI.getDesc().isCall()) + else if (IsLoopStart(MI)) + Start = &MI; + else if (MI.getDesc().isCall()) { // TODO: Though the call will require LE to execute again, does this // mean we should revert? Always executing LE hopefully should be // faster than performing a sub,cmp,br or even subs,br. Revert = true; + LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n"); + } - if (!Dec) + if (!Dec || End) continue; - // If we find that we load/store LR between LoopDec and LoopEnd, expect - // that the decremented value has been spilled to the stack. Because - // this value isn't actually going to be produced until the latch, by LE, - // we would need to generate a real sub. The value is also likely to be - // reloaded for use of LoopEnd - in which in case we'd need to perform - // an add because it gets negated again by LE! The other option is to - // then generate the other form of LE which doesn't perform the sub. - if (MI.mayLoad() || MI.mayStore()) - Revert = - MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == ARM::LR; + // If we find that LR has been written or read between LoopDec and + // LoopEnd, expect that the decremented value is being used else where. + // Because this value isn't actually going to be produced until the + // latch, by LE, we would need to generate a real sub. The value is also + // likely to be copied/reloaded for use of LoopEnd - in which in case + // we'd need to perform an add because it gets subtracted again by LE! + // The other option is to then generate the other form of LE which doesn't + // perform the sub. + for (auto &MO : MI.operands()) { + if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() && + MO.getReg() == ARM::LR) { + LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI); + Revert = true; + break; + } + } } if (Dec && End && Revert) break; } + LLVM_DEBUG(if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; + if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; + if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;); + if (!Start && !Dec && !End) { LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n"); return Changed; - } if (!(Start && Dec && End)) { - report_fatal_error("Failed to find all loop components"); + } else if (!(Start && Dec && End)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find all loop components.\n"); + return false; } - if (!End->getOperand(1).isMBB() || - End->getOperand(1).getMBB() != ML->getHeader()) - report_fatal_error("Expected LoopEnd to target Loop Header"); + if (!End->getOperand(1).isMBB()) + report_fatal_error("Expected LoopEnd to target basic block"); + + // TODO Maybe there's cases where the target doesn't have to be the header, + // but for now be safe and revert. + if (End->getOperand(1).getMBB() != ML->getHeader()) { + LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n"); + Revert = true; + } // The WLS and LE instructions have 12-bits for the label offset. WLS // requires a positive offset, while LE uses negative. 
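A quick model of that range constraint, assuming a raw 12-bit byte offset as the comment states (the in-tree check goes through ARMBasicBlockUtils::isBBInRange, and the real encodings add scaling):

```cpp
#include <cassert>

// Model of the branch-range constraint above: WLS needs a positive
// (forward) offset, LE a negative (backward) one, each with 12 bits
// of magnitude. Encoding details are simplified here.
bool wlsOffsetInRange(int offset) {
  return offset >= 0 && offset < (1 << 12);
}
bool leOffsetInRange(int offset) {
  return offset <= 0 && -offset < (1 << 12);
}

int main() {
  assert(wlsOffsetInRange(4095) && !wlsOffsetInRange(1 << 12));
  assert(leOffsetInRange(-4095) && !leOffsetInRange(-(1 << 12)));
  assert(!wlsOffsetInRange(-2) && !leOffsetInRange(2));
  return 0;
}
```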
@@ -216,41 +341,57 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { Revert = true; } - LLVM_DEBUG(dbgs() << "ARM Loops:\n - Found Loop Start: " << *Start - << " - Found Loop Dec: " << *Dec - << " - Found Loop End: " << *End); + MachineInstr *InsertPt = Revert ? nullptr : IsSafeToDefineLR(Start); + if (!InsertPt) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n"); + Revert = true; + } else + LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt); - Expand(ML, Start, Dec, End, Revert); + Expand(ML, Start, InsertPt, Dec, End, Revert); return true; } // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a // beq that branches to the exit branch. -// FIXME: Need to check that we're not trashing the CPSR when generating the -// cmp. We could also try to generate a cbz if the value in LR is also in +// TODO: We could also try to generate a cbz if the value in LR is also in // another low register. void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI); MachineBasicBlock *MBB = MI->getParent(); MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2CMPri)); - MIB.addReg(ARM::LR); + MIB.add(MI->getOperand(0)); MIB.addImm(0); MIB.addImm(ARMCC::AL); - MIB.addReg(ARM::CPSR); + MIB.addReg(ARM::NoRegister); + + MachineBasicBlock *DestBB = MI->getOperand(1).getMBB(); + unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? + ARM::tBcc : ARM::t2Bcc; - // TODO: Try to use tBcc instead - MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc)); + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); MIB.add(MI->getOperand(1)); // branch target MIB.addImm(ARMCC::EQ); // condition code MIB.addReg(ARM::CPSR); MI->eraseFromParent(); } -// TODO: Check flags so that we can possibly generate a tSubs or tSub. -void ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { +bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI, + bool AllowFlags) const { LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); MachineBasicBlock *MBB = MI->getParent(); + + // If nothing uses or defines CPSR between LoopDec and LoopEnd, use a t2SUBS. + bool SetFlags = false; + if (AllowFlags) { + if (auto *Def = SearchForDef(MI, MBB->end(), ARM::CPSR)) { + if (!SearchForUse(MI, MBB->end(), ARM::CPSR) && + Def->getOpcode() == ARM::t2LoopEnd) + SetFlags = true; + } + } + MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri)); MIB.addDef(ARM::LR); @@ -258,28 +399,39 @@ void ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { MIB.add(MI->getOperand(2)); MIB.addImm(ARMCC::AL); MIB.addReg(0); - MIB.addReg(0); + + if (SetFlags) { + MIB.addReg(ARM::CPSR); + MIB->getOperand(5).setIsDef(true); + } else + MIB.addReg(0); + MI->eraseFromParent(); + return SetFlags; } // Generate a subs, or sub and cmp, and a branch instead of an LE. -// FIXME: Need to check that we're not trashing the CPSR when generating -// the cmp. 
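When any of these checks force a reversion, the pseudos become ordinary Thumb-2 code. In scalar terms the reverted latch behaves like the sketch below (`revertedLoop` is an illustrative function, not pass code; the t2SUBS path merely folds the compare into the subtract's flag write):

```cpp
#include <cstdint>

// Scalar model of a reverted loop latch: an explicit decrement, a
// compare against zero (skippable when SUBS already set the flags),
// and a conditional branch back to the header.
uint32_t revertedLoop(uint32_t count) {
  uint32_t acc = 0;
  uint32_t lr = count;    // value the removed t2DoLoopStart provided
header:
  acc += lr;              // loop body
  lr -= 1;                // t2SUBri, or t2SUBS when CPSR is free
  if (lr != 0)            // t2CMPri + bne; folded away after SUBS
    goto header;
  return acc;
}

int main() { return revertedLoop(4) == 10 ? 0 : 1; }
```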
-void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI) const { +void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const { LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI); - // Create cmp MachineBasicBlock *MBB = MI->getParent(); - MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), - TII->get(ARM::t2CMPri)); - MIB.addReg(ARM::LR); - MIB.addImm(0); - MIB.addImm(ARMCC::AL); - MIB.addReg(ARM::CPSR); + // Create cmp + if (!SkipCmp) { + MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(ARM::t2CMPri)); + MIB.addReg(ARM::LR); + MIB.addImm(0); + MIB.addImm(ARMCC::AL); + MIB.addReg(ARM::NoRegister); + } + + MachineBasicBlock *DestBB = MI->getOperand(1).getMBB(); + unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? + ARM::tBcc : ARM::t2Bcc; - // TODO Try to use tBcc instead. // Create bne - MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc)); + MachineInstrBuilder MIB = + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); MIB.add(MI->getOperand(1)); // branch target MIB.addImm(ARMCC::NE); // condition code MIB.addReg(ARM::CPSR); @@ -287,33 +439,13 @@ void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI) const { } void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start, + MachineInstr *InsertPt, MachineInstr *Dec, MachineInstr *End, bool Revert) { - auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start) { - // The trip count should already been held in LR since the instructions - // within the loop can only read and write to LR. So, there should be a - // mov to setup the count. WLS/DLS perform this move, so find the original - // and delete it - inserting WLS/DLS in its place. - MachineBasicBlock *MBB = Start->getParent(); - MachineInstr *InsertPt = Start; - for (auto &I : MRI->def_instructions(ARM::LR)) { - if (I.getParent() != MBB) - continue; - - // Always execute. - if (!I.getOperand(2).isImm() || I.getOperand(2).getImm() != ARMCC::AL) - continue; - - // Only handle move reg, if the trip count it will need moving into a reg - // before the setup instruction anyway. - if (!I.getDesc().isMoveReg() || - !I.getOperand(1).isIdenticalTo(Start->getOperand(0))) - continue; - InsertPt = &I; - break; - } - + auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start, + MachineInstr *InsertPt) { + MachineBasicBlock *MBB = InsertPt->getParent(); unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ? 
ARM::t2DLS : ARM::t2WLS; MachineInstrBuilder MIB = @@ -369,16 +501,54 @@ void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start, RevertWhile(Start); else Start->eraseFromParent(); - RevertLoopDec(Dec); - RevertLoopEnd(End); + bool FlagsAlreadySet = RevertLoopDec(Dec, true); + RevertLoopEnd(End, FlagsAlreadySet); } else { - Start = ExpandLoopStart(ML, Start); + Start = ExpandLoopStart(ML, Start, InsertPt); RemoveDeadBranch(Start); End = ExpandLoopEnd(ML, Dec, End); RemoveDeadBranch(End); } } +bool ARMLowOverheadLoops::RevertNonLoops() { + LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n"); + bool Changed = false; + + for (auto &MBB : *MF) { + SmallVector<MachineInstr*, 4> Starts; + SmallVector<MachineInstr*, 4> Decs; + SmallVector<MachineInstr*, 4> Ends; + + for (auto &I : MBB) { + if (IsLoopStart(I)) + Starts.push_back(&I); + else if (I.getOpcode() == ARM::t2LoopDec) + Decs.push_back(&I); + else if (I.getOpcode() == ARM::t2LoopEnd) + Ends.push_back(&I); + } + + if (Starts.empty() && Decs.empty() && Ends.empty()) + continue; + + Changed = true; + + for (auto *Start : Starts) { + if (Start->getOpcode() == ARM::t2WhileLoopStart) + RevertWhile(Start); + else + Start->eraseFromParent(); + } + for (auto *Dec : Decs) + RevertLoopDec(Dec); + + for (auto *End : Ends) + RevertLoopEnd(End); + } + return Changed; +} + FunctionPass *llvm::createARMLowOverheadLoopsPass() { return new ARMLowOverheadLoops(); } diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp index 90c5ad025e56..c92689f4942e 100644 --- a/lib/Target/ARM/ARMMCInstLower.cpp +++ b/lib/Target/ARM/ARMMCInstLower.cpp @@ -74,8 +74,8 @@ bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO, switch (MO.getType()) { default: llvm_unreachable("unknown operand type"); case MachineOperand::MO_Register: - // Ignore all non-CPSR implicit register operands. - if (MO.isImplicit() && MO.getReg() != ARM::CPSR) + // Ignore all implicit register operands. + if (MO.isImplicit()) return false; assert(!MO.getSubReg() && "Subregs should be eliminated!"); MCOp = MCOperand::createReg(MO.getReg()); diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h index 90d794cd27b1..bb136e92329b 100644 --- a/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/Support/ErrorHandling.h" #include <utility> @@ -130,6 +131,10 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// The amount the literal pool has been increasedby due to promoted globals. int PromotedGlobalsIncrease = 0; + /// True if r0 will be preserved by a call to this function (e.g. C++ + /// con/destructors). 
+ bool PreservesR0 = false; + public: ARMFunctionInfo() = default; @@ -247,6 +252,9 @@ public: } DenseMap<unsigned, unsigned> EHPrologueRemappedRegs; + + void setPreservesR0() { PreservesR0 = true; } + bool getPreservesR0() const { return PreservesR0; } }; } // end namespace llvm diff --git a/lib/Target/ARM/ARMParallelDSP.cpp b/lib/Target/ARM/ARMParallelDSP.cpp index 5389d09bf7d7..ae5657a0a2c1 100644 --- a/lib/Target/ARM/ARMParallelDSP.cpp +++ b/lib/Target/ARM/ARMParallelDSP.cpp @@ -1,4 +1,4 @@ -//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===// +//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -18,13 +18,11 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopAccessAnalysis.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/OrderedBasicBlock.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/NoFolder.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/PassSupport.h" @@ -45,54 +43,39 @@ static cl::opt<bool> DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false), cl::desc("Disable the ARM Parallel DSP pass")); +static cl::opt<unsigned> +NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16), + cl::desc("Limit the number of loads analysed")); + namespace { - struct OpChain; - struct BinOpChain; + struct MulCandidate; class Reduction; - using OpChainList = SmallVector<std::unique_ptr<OpChain>, 8>; - using ReductionList = SmallVector<Reduction, 8>; - using ValueList = SmallVector<Value*, 8>; - using MemInstList = SmallVector<LoadInst*, 8>; - using PMACPair = std::pair<BinOpChain*,BinOpChain*>; - using PMACPairList = SmallVector<PMACPair, 8>; - using Instructions = SmallVector<Instruction*,16>; - using MemLocList = SmallVector<MemoryLocation, 4>; + using MulCandList = SmallVector<std::unique_ptr<MulCandidate>, 8>; + using MemInstList = SmallVectorImpl<LoadInst*>; + using MulPairList = SmallVector<std::pair<MulCandidate*, MulCandidate*>, 8>; - struct OpChain { + // 'MulCandidate' holds the multiplication instructions that are candidates + // for parallel execution. + struct MulCandidate { Instruction *Root; - ValueList AllValues; - MemInstList VecLd; // List of all load instructions. - MemInstList Loads; + Value* LHS; + Value* RHS; + bool Exchange = false; bool ReadOnly = true; + bool Paired = false; + SmallVector<LoadInst*, 2> VecLd; // Container for loads to widen. - OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { } - virtual ~OpChain() = default; + MulCandidate(Instruction *I, Value *lhs, Value *rhs) : + Root(I), LHS(lhs), RHS(rhs) { } - void PopulateLoads() { - for (auto *V : AllValues) { - if (auto *Ld = dyn_cast<LoadInst>(V)) - Loads.push_back(Ld); - } + bool HasTwoLoadInputs() const { + return isa<LoadInst>(LHS) && isa<LoadInst>(RHS); } - unsigned size() const { return AllValues.size(); } - }; - - // 'BinOpChain' holds the multiplication instructions that are candidates - // for parallel execution. - struct BinOpChain : public OpChain { - ValueList LHS; // List of all (narrow) left hand operands. - ValueList RHS; // List of all (narrow) right hand operands. 
- bool Exchange = false; - - BinOpChain(Instruction *I, ValueList &lhs, ValueList &rhs) : - OpChain(I, lhs), LHS(lhs), RHS(rhs) { - for (auto *V : RHS) - AllValues.push_back(V); - } - - bool AreSymmetrical(BinOpChain *Other); + LoadInst *getBaseLoad() const { + return VecLd.front(); + } }; /// Represent a sequence of multiply-accumulate operations with the aim to @@ -100,9 +83,9 @@ namespace { class Reduction { Instruction *Root = nullptr; Value *Acc = nullptr; - OpChainList Muls; - PMACPairList MulPairs; - SmallPtrSet<Instruction*, 4> Adds; + MulCandList Muls; + MulPairList MulPairs; + SetVector<Instruction*> Adds; public: Reduction() = delete; @@ -112,10 +95,35 @@ namespace { /// Record an Add instruction that is a part of the this reduction. void InsertAdd(Instruction *I) { Adds.insert(I); } - /// Record a BinOpChain, rooted at a Mul instruction, that is a part of - /// this reduction. - void InsertMul(Instruction *I, ValueList &LHS, ValueList &RHS) { - Muls.push_back(make_unique<BinOpChain>(I, LHS, RHS)); + /// Create MulCandidates, each rooted at a Mul instruction, that is a part + /// of this reduction. + void InsertMuls() { + auto GetMulOperand = [](Value *V) -> Instruction* { + if (auto *SExt = dyn_cast<SExtInst>(V)) { + if (auto *I = dyn_cast<Instruction>(SExt->getOperand(0))) + if (I->getOpcode() == Instruction::Mul) + return I; + } else if (auto *I = dyn_cast<Instruction>(V)) { + if (I->getOpcode() == Instruction::Mul) + return I; + } + return nullptr; + }; + + auto InsertMul = [this](Instruction *I) { + Value *LHS = cast<Instruction>(I->getOperand(0))->getOperand(0); + Value *RHS = cast<Instruction>(I->getOperand(1))->getOperand(0); + Muls.push_back(std::make_unique<MulCandidate>(I, LHS, RHS)); + }; + + for (auto *Add : Adds) { + if (Add == Acc) + continue; + if (auto *Mul = GetMulOperand(Add->getOperand(0))) + InsertMul(Mul); + if (auto *Mul = GetMulOperand(Add->getOperand(1))) + InsertMul(Mul); + } } /// Add the incoming accumulator value, returns true if a value had not @@ -128,9 +136,17 @@ namespace { return true; } - /// Set two BinOpChains, rooted at muls, that can be executed as a single + /// Set two MulCandidates, rooted at muls, that can be executed as a single /// parallel operation. - void AddMulPair(BinOpChain *Mul0, BinOpChain *Mul1) { + void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1, + bool Exchange = false) { + LLVM_DEBUG(dbgs() << "Pairing:\n" + << *Mul0->Root << "\n" + << *Mul1->Root << "\n"); + Mul0->Paired = true; + Mul1->Paired = true; + if (Exchange) + Mul1->Exchange = true; MulPairs.push_back(std::make_pair(Mul0, Mul1)); } @@ -141,24 +157,40 @@ namespace { /// Return the add instruction which is the root of the reduction. Instruction *getRoot() { return Root; } + bool is64Bit() const { return Root->getType()->isIntegerTy(64); } + + Type *getType() const { return Root->getType(); } + /// Return the incoming value to be accumulated. This maybe null. Value *getAccumulator() { return Acc; } /// Return the set of adds that comprise the reduction. - SmallPtrSetImpl<Instruction*> &getAdds() { return Adds; } + SetVector<Instruction*> &getAdds() { return Adds; } - /// Return the BinOpChain, rooted at mul instruction, that comprise the + /// Return the MulCandidate, rooted at mul instruction, that comprise the /// the reduction. 
- OpChainList &getMuls() { return Muls; } + MulCandList &getMuls() { return Muls; } - /// Return the BinOpChain, rooted at mul instructions, that have been + /// Return the MulCandidate, rooted at mul instructions, that have been /// paired for parallel execution. - PMACPairList &getMulPairs() { return MulPairs; } + MulPairList &getMulPairs() { return MulPairs; } /// To finalise, replace the uses of the root with the intrinsic call. void UpdateRoot(Instruction *SMLAD) { Root->replaceAllUsesWith(SMLAD); } + + void dump() { + LLVM_DEBUG(dbgs() << "Reduction:\n"; + for (auto *Add : Adds) + LLVM_DEBUG(dbgs() << *Add << "\n"); + for (auto &Mul : Muls) + LLVM_DEBUG(dbgs() << *Mul->Root << "\n" + << " " << *Mul->LHS << "\n" + << " " << *Mul->RHS << "\n"); + LLVM_DEBUG(if (Acc) dbgs() << "Acc in: " << *Acc << "\n") + ); + } }; class WidenedLoad { @@ -176,13 +208,11 @@ namespace { } }; - class ARMParallelDSP : public LoopPass { + class ARMParallelDSP : public FunctionPass { ScalarEvolution *SE; AliasAnalysis *AA; TargetLibraryInfo *TLI; DominatorTree *DT; - LoopInfo *LI; - Loop *L; const DataLayout *DL; Module *M; std::map<LoadInst*, LoadInst*> LoadPairs; @@ -190,13 +220,12 @@ namespace { std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads; template<unsigned> - bool IsNarrowSequence(Value *V, ValueList &VL); - + bool IsNarrowSequence(Value *V); + bool Search(Value *V, BasicBlock *BB, Reduction &R); bool RecordMemoryOps(BasicBlock *BB); void InsertParallelMACs(Reduction &Reduction); bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem); - LoadInst* CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads, - IntegerType *LoadTy); + LoadInst* CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy); bool CreateParallelPairs(Reduction &R); /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate @@ -204,60 +233,38 @@ namespace { /// products to a 32-bit accumulate operand. Optionally, the instruction can /// exchange the halfwords of the second operand before performing the /// arithmetic. 
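Concretely, the instruction family being matched computes both 16-by-16 products in one step. A scalar model of the SMLAD/SMLADX behaviour the comment describes (`smlad` is an illustrative helper, not the intrinsic lowering):

```cpp
#include <cassert>
#include <cstdint>
#include <utility>

// Scalar model of SMLAD/SMLADX: multiply the signed 16-bit halves of
// the operands pairwise and add both products to the accumulator.
// SMLADX first exchanges the halfwords of the second operand.
int32_t smlad(uint32_t a, uint32_t b, int32_t acc, bool exchange) {
  int16_t aLo = static_cast<int16_t>(a), aHi = static_cast<int16_t>(a >> 16);
  int16_t bLo = static_cast<int16_t>(b), bHi = static_cast<int16_t>(b >> 16);
  if (exchange)
    std::swap(bLo, bHi);
  return acc + aLo * bLo + aHi * bHi;
}

int main() {
  uint32_t a = (2u << 16) | 1u;  // halves {1, 2}
  uint32_t b = (4u << 16) | 3u;  // halves {3, 4}
  assert(smlad(a, b, 10, false) == 10 + 1 * 3 + 2 * 4);  // 21
  assert(smlad(a, b, 10, true)  == 10 + 1 * 4 + 2 * 3);  // 20
  return 0;
}
```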
- bool MatchSMLAD(Loop *L); + bool MatchSMLAD(Function &F); public: static char ID; - ARMParallelDSP() : LoopPass(ID) { } - - bool doInitialization(Loop *L, LPPassManager &LPM) override { - LoadPairs.clear(); - WideLoads.clear(); - return true; - } + ARMParallelDSP() : FunctionPass(ID) { } void getAnalysisUsage(AnalysisUsage &AU) const override { - LoopPass::getAnalysisUsage(AU); + FunctionPass::getAnalysisUsage(AU); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetPassConfig>(); - AU.addPreserved<LoopInfoWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.setPreservesCFG(); } - bool runOnLoop(Loop *TheLoop, LPPassManager &) override { + bool runOnFunction(Function &F) override { if (DisableParallelDSP) return false; - L = TheLoop; + if (skipFunction(F)) + return false; + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto &TPC = getAnalysis<TargetPassConfig>(); - BasicBlock *Header = TheLoop->getHeader(); - if (!Header) - return false; - - // TODO: We assume the loop header and latch to be the same block. - // This is not a fundamental restriction, but lifting this would just - // require more work to do the transformation and then patch up the CFG. - if (Header != TheLoop->getLoopLatch()) { - LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not " - "running pass ARMParallelDSP\n"); - return false; - } - - if (!TheLoop->getLoopPreheader()) - InsertPreheaderForLoop(L, DT, LI, nullptr, true); - - Function &F = *Header->getParent(); M = F.getParent(); DL = &M->getDataLayout(); @@ -282,17 +289,10 @@ namespace { return false; } - LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI); - LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n"); LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n"); - if (!RecordMemoryOps(Header)) { - LLVM_DEBUG(dbgs() << " - No sequential loads found.\n"); - return false; - } - - bool Changes = MatchSMLAD(L); + bool Changes = MatchSMLAD(F); return Changes; } }; @@ -331,40 +331,14 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, // TODO: we currently only collect i16, and will support i8 later, so that's // why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth. template<unsigned MaxBitWidth> -bool ARMParallelDSP::IsNarrowSequence(Value *V, ValueList &VL) { - ConstantInt *CInt; - - if (match(V, m_ConstantInt(CInt))) { - // TODO: if a constant is used, it needs to fit within the bit width. - return false; - } - - auto *I = dyn_cast<Instruction>(V); - if (!I) - return false; - - Value *Val, *LHS, *RHS; - if (match(V, m_Trunc(m_Value(Val)))) { - if (cast<TruncInst>(I)->getDestTy()->getIntegerBitWidth() == MaxBitWidth) - return IsNarrowSequence<MaxBitWidth>(Val, VL); - } else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) { - // TODO: we need to implement sadd16/sadd8 for this, which enables to - // also do the rewrite for smlad8.ll, but it is unsupported for now. 
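The narrow sequences being matched come from source like the following dot-product kernel, where every mul operand is a sign-extended 16-bit load; a sketch of the expected input, not of the pass:

```cpp
#include <cstdint>

// The kind of source the pass transforms: products of sign-extended
// 16-bit loads accumulated into a 32-bit sum. Each a[i]/b[i] becomes
// a load feeding a sext feeding a mul, which the search then records.
int32_t dotProduct(const int16_t *a, const int16_t *b, int n) {
  int32_t acc = 0;
  for (int i = 0; i < n; ++i)
    acc += static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
  return acc;
}

int main() {
  int16_t a[] = {1, 2, 3, 4};
  int16_t b[] = {5, 6, 7, 8};
  return dotProduct(a, b, 4) == 70 ? 0 : 1;
}
```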
- return false; - } else if (match(V, m_ZExtOrSExt(m_Value(Val)))) { - if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) +bool ARMParallelDSP::IsNarrowSequence(Value *V) { + if (auto *SExt = dyn_cast<SExtInst>(V)) { + if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) return false; - if (match(Val, m_Load(m_Value()))) { - auto *Ld = cast<LoadInst>(Val); - - // Check that these load could be paired. - if (!LoadPairs.count(Ld) && !OffsetLoads.count(Ld)) - return false; - - VL.push_back(Val); - VL.push_back(I); - return true; + if (auto *Ld = dyn_cast<LoadInst>(SExt->getOperand(0))) { + // Check that this load could be paired. + return LoadPairs.count(Ld) || OffsetLoads.count(Ld); } } return false; @@ -375,6 +349,9 @@ bool ARMParallelDSP::IsNarrowSequence(Value *V, ValueList &VL) { bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { SmallVector<LoadInst*, 8> Loads; SmallVector<Instruction*, 8> Writes; + LoadPairs.clear(); + WideLoads.clear(); + OrderedBasicBlock OrderedBB(BB); // Collect loads and instruction that may write to memory. For now we only // record loads which are simple, sign-extended and have a single user. @@ -389,21 +366,24 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { Loads.push_back(Ld); } + if (Loads.empty() || Loads.size() > NumLoadLimit) + return false; + using InstSet = std::set<Instruction*>; using DepMap = std::map<Instruction*, InstSet>; DepMap RAWDeps; // Record any writes that may alias a load. const auto Size = LocationSize::unknown(); - for (auto Read : Loads) { - for (auto Write : Writes) { + for (auto Write : Writes) { + for (auto Read : Loads) { MemoryLocation ReadLoc = MemoryLocation(Read->getPointerOperand(), Size); if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc), ModRefInfo::ModRef))) continue; - if (DT->dominates(Write, Read)) + if (OrderedBB.dominates(Write, Read)) RAWDeps[Read].insert(Write); } } @@ -411,17 +391,16 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { // Check whether there's not a write between the two loads which would // prevent them from being safely merged. auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) { - LoadInst *Dominator = DT->dominates(Base, Offset) ? Base : Offset; - LoadInst *Dominated = DT->dominates(Base, Offset) ? Offset : Base; + LoadInst *Dominator = OrderedBB.dominates(Base, Offset) ? Base : Offset; + LoadInst *Dominated = OrderedBB.dominates(Base, Offset) ? Offset : Base; if (RAWDeps.count(Dominated)) { InstSet &WritesBefore = RAWDeps[Dominated]; for (auto Before : WritesBefore) { - // We can't move the second load backward, past a write, to merge // with the first load. - if (DT->dominates(Dominator, Before)) + if (OrderedBB.dominates(Dominator, Before)) return false; } } @@ -431,7 +410,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { // Record base, offset load pairs. for (auto *Base : Loads) { for (auto *Offset : Loads) { - if (Base == Offset) + if (Base == Offset || OffsetLoads.count(Offset)) continue; if (AreSequentialAccesses<LoadInst>(Base, Offset, *DL, *SE) && @@ -453,7 +432,54 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { return LoadPairs.size() > 1; } -// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector +// Search recursively back through the operands to find a tree of values that +// form a multiply-accumulate chain. 
The search records the Add and Mul +// instructions that form the reduction and allows us to find a single value +// to be used as the initial input to the accumlator. +bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) { + // If we find a non-instruction, try to use it as the initial accumulator + // value. This may have already been found during the search in which case + // this function will return false, signaling a search fail. + auto *I = dyn_cast<Instruction>(V); + if (!I) + return R.InsertAcc(V); + + if (I->getParent() != BB) + return false; + + switch (I->getOpcode()) { + default: + break; + case Instruction::PHI: + // Could be the accumulator value. + return R.InsertAcc(V); + case Instruction::Add: { + // Adds should be adding together two muls, or another add and a mul to + // be within the mac chain. One of the operands may also be the + // accumulator value at which point we should stop searching. + R.InsertAdd(I); + Value *LHS = I->getOperand(0); + Value *RHS = I->getOperand(1); + bool ValidLHS = Search(LHS, BB, R); + bool ValidRHS = Search(RHS, BB, R); + + if (ValidLHS && ValidRHS) + return true; + + return R.InsertAcc(I); + } + case Instruction::Mul: { + Value *MulOp0 = I->getOperand(0); + Value *MulOp1 = I->getOperand(1); + return IsNarrowSequence<16>(MulOp0) && IsNarrowSequence<16>(MulOp1); + } + case Instruction::SExt: + return Search(I->getOperand(0), BB, R); + } + return false; +} + +// The pass needs to identify integer add/sub reductions of 16-bit vector // multiplications. // To use SMLAD: // 1) we first need to find integer add then look for this pattern: @@ -484,88 +510,39 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { // If loop invariants are used instead of loads, these need to be packed // before the loop begins. // -bool ARMParallelDSP::MatchSMLAD(Loop *L) { - // Search recursively back through the operands to find a tree of values that - // form a multiply-accumulate chain. The search records the Add and Mul - // instructions that form the reduction and allows us to find a single value - // to be used as the initial input to the accumlator. - std::function<bool(Value*, Reduction&)> Search = [&] - (Value *V, Reduction &R) -> bool { - - // If we find a non-instruction, try to use it as the initial accumulator - // value. This may have already been found during the search in which case - // this function will return false, signaling a search fail. - auto *I = dyn_cast<Instruction>(V); - if (!I) - return R.InsertAcc(V); - - switch (I->getOpcode()) { - default: - break; - case Instruction::PHI: - // Could be the accumulator value. - return R.InsertAcc(V); - case Instruction::Add: { - // Adds should be adding together two muls, or another add and a mul to - // be within the mac chain. One of the operands may also be the - // accumulator value at which point we should stop searching. 
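A loose standalone model of this recursion, with a toy expression tree standing in for the IR (simplified: the real Search also enforces same-block operands and the narrow-sequence check on muls):

```cpp
#include <cassert>

// Toy model of the search: adds may combine muls or further adds; any
// other value is accepted at most once as the incoming accumulator,
// including an add whose other operand failed the search.
struct Expr {
  char op;                         // '+', '*', or 'v' for a plain value
  const Expr *lhs = nullptr;
  const Expr *rhs = nullptr;
};

bool search(const Expr *e, const Expr *&acc) {
  if (e->op == '*')
    return true;                   // mul candidate; operands vetted later
  if (e->op != '+') {              // non-add leaf: try it as the acc
    if (acc) return false;         // only one accumulator is allowed
    acc = e;
    return true;
  }
  bool okL = search(e->lhs, acc);
  bool okR = search(e->rhs, acc);
  if (okL && okR)
    return true;
  if (acc) return false;           // else this add becomes the acc
  acc = e;
  return true;
}

int main() {
  Expr v{'v'}, m0{'*'}, m1{'*'};
  Expr inner{'+', &m0, &m1};
  Expr root{'+', &inner, &v};
  const Expr *acc = nullptr;
  assert(search(&root, acc) && acc == &v);
  return 0;
}
```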
- bool ValidLHS = Search(I->getOperand(0), R); - bool ValidRHS = Search(I->getOperand(1), R); - if (!ValidLHS && !ValidLHS) - return false; - else if (ValidLHS && ValidRHS) { - R.InsertAdd(I); - return true; - } else { - R.InsertAdd(I); - return R.InsertAcc(I); - } - } - case Instruction::Mul: { - Value *MulOp0 = I->getOperand(0); - Value *MulOp1 = I->getOperand(1); - if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1)) { - ValueList LHS; - ValueList RHS; - if (IsNarrowSequence<16>(MulOp0, LHS) && - IsNarrowSequence<16>(MulOp1, RHS)) { - R.InsertMul(I, LHS, RHS); - return true; - } - } - return false; - } - case Instruction::SExt: - return Search(I->getOperand(0), R); - } - return false; - }; - +bool ARMParallelDSP::MatchSMLAD(Function &F) { bool Changed = false; - SmallPtrSet<Instruction*, 4> AllAdds; - BasicBlock *Latch = L->getLoopLatch(); - for (Instruction &I : reverse(*Latch)) { - if (I.getOpcode() != Instruction::Add) + for (auto &BB : F) { + SmallPtrSet<Instruction*, 4> AllAdds; + if (!RecordMemoryOps(&BB)) continue; - if (AllAdds.count(&I)) - continue; + for (Instruction &I : reverse(BB)) { + if (I.getOpcode() != Instruction::Add) + continue; - const auto *Ty = I.getType(); - if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64)) - continue; + if (AllAdds.count(&I)) + continue; - Reduction R(&I); - if (!Search(&I, R)) - continue; + const auto *Ty = I.getType(); + if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64)) + continue; - if (!CreateParallelPairs(R)) - continue; + Reduction R(&I); + if (!Search(&I, &BB, R)) + continue; + + R.InsertMuls(); + LLVM_DEBUG(dbgs() << "After search, Reduction:\n"; R.dump()); + + if (!CreateParallelPairs(R)) + continue; - InsertParallelMACs(R); - Changed = true; - AllAdds.insert(R.getAdds().begin(), R.getAdds().end()); + InsertParallelMACs(R); + Changed = true; + AllAdds.insert(R.getAdds().begin(), R.getAdds().end()); + } } return Changed; @@ -578,87 +555,57 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) { return false; // Check that the muls operate directly upon sign extended loads. - for (auto &MulChain : R.getMuls()) { - // A mul has 2 operands, and a narrow op consist of sext and a load; thus - // we expect at least 4 items in this operand value list. - if (MulChain->size() < 4) { - LLVM_DEBUG(dbgs() << "Operand list too short.\n"); + for (auto &MulCand : R.getMuls()) { + if (!MulCand->HasTwoLoadInputs()) return false; - } - MulChain->PopulateLoads(); - ValueList &LHS = static_cast<BinOpChain*>(MulChain.get())->LHS; - ValueList &RHS = static_cast<BinOpChain*>(MulChain.get())->RHS; - - // Use +=2 to skip over the expected extend instructions. - for (unsigned i = 0, e = LHS.size(); i < e; i += 2) { - if (!isa<LoadInst>(LHS[i]) || !isa<LoadInst>(RHS[i])) - return false; - } } - auto CanPair = [&](Reduction &R, BinOpChain *PMul0, BinOpChain *PMul1) { - if (!PMul0->AreSymmetrical(PMul1)) - return false; - + auto CanPair = [&](Reduction &R, MulCandidate *PMul0, MulCandidate *PMul1) { // The first elements of each vector should be loads with sexts. If we // find that its two pairs of consecutive loads, then these can be // transformed into two wider loads and the users can be replaced with // DSP intrinsics. 
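What pairing buys, modelled on plain memory: two sequential 16-bit loads collapse into a single 32-bit load whose halves are split back out, matching the little-endian trunc/lshr recovery used later in CreateWideLoad (a sketch; `widenedLoadPair` is an illustrative name, and the alias and ordering checks remain the pass's job):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// Model of the load widening that pairing enables: one 32-bit load
// replaces two sequential 16-bit loads, and the original values are
// recovered from its low and high halves.
void widenedLoadPair(const int16_t *p, int16_t &lo, int16_t &hi) {
  uint32_t wide;
  std::memcpy(&wide, p, sizeof wide);     // the single wide load
  lo = static_cast<int16_t>(wide);        // was: load p[0]
  hi = static_cast<int16_t>(wide >> 16);  // was: load p[1]
}

int main() {
  int16_t data[] = {-3, 7};
  int16_t lo = 0, hi = 0;
  widenedLoadPair(data, lo, hi);
  assert(lo == -3 && hi == 7);
  return 0;
}
```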
- for (unsigned x = 0; x < PMul0->LHS.size(); x += 2) { - auto *Ld0 = dyn_cast<LoadInst>(PMul0->LHS[x]); - auto *Ld1 = dyn_cast<LoadInst>(PMul1->LHS[x]); - auto *Ld2 = dyn_cast<LoadInst>(PMul0->RHS[x]); - auto *Ld3 = dyn_cast<LoadInst>(PMul1->RHS[x]); - - if (!Ld0 || !Ld1 || !Ld2 || !Ld3) - return false; - - LLVM_DEBUG(dbgs() << "Loads:\n" - << " - " << *Ld0 << "\n" - << " - " << *Ld1 << "\n" - << " - " << *Ld2 << "\n" - << " - " << *Ld3 << "\n"); + auto Ld0 = static_cast<LoadInst*>(PMul0->LHS); + auto Ld1 = static_cast<LoadInst*>(PMul1->LHS); + auto Ld2 = static_cast<LoadInst*>(PMul0->RHS); + auto Ld3 = static_cast<LoadInst*>(PMul1->RHS); - if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) { - if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { - LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); - R.AddMulPair(PMul0, PMul1); - return true; - } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) { - LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); - LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n"); - PMul1->Exchange = true; - R.AddMulPair(PMul0, PMul1); - return true; - } - } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) && - AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { + if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) { + if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { + LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); + R.AddMulPair(PMul0, PMul1); + return true; + } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) { LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); - LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n"); - LLVM_DEBUG(dbgs() << " and swapping muls\n"); - PMul0->Exchange = true; - // Only the second operand can be exchanged, so swap the muls. - R.AddMulPair(PMul1, PMul0); + LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n"); + R.AddMulPair(PMul0, PMul1, true); return true; } + } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) && + AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { + LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); + LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n"); + LLVM_DEBUG(dbgs() << " and swapping muls\n"); + // Only the second operand can be exchanged, so swap the muls. 
+ R.AddMulPair(PMul1, PMul0, true); + return true; } return false; }; - OpChainList &Muls = R.getMuls(); + MulCandList &Muls = R.getMuls(); const unsigned Elems = Muls.size(); - SmallPtrSet<const Instruction*, 4> Paired; for (unsigned i = 0; i < Elems; ++i) { - BinOpChain *PMul0 = static_cast<BinOpChain*>(Muls[i].get()); - if (Paired.count(PMul0->Root)) + MulCandidate *PMul0 = static_cast<MulCandidate*>(Muls[i].get()); + if (PMul0->Paired) continue; for (unsigned j = 0; j < Elems; ++j) { if (i == j) continue; - BinOpChain *PMul1 = static_cast<BinOpChain*>(Muls[j].get()); - if (Paired.count(PMul1->Root)) + MulCandidate *PMul1 = static_cast<MulCandidate*>(Muls[j].get()); + if (PMul1->Paired) continue; const Instruction *Mul0 = PMul0->Root; @@ -668,29 +615,19 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) { assert(PMul0 != PMul1 && "expected different chains"); - if (CanPair(R, PMul0, PMul1)) { - Paired.insert(Mul0); - Paired.insert(Mul1); + if (CanPair(R, PMul0, PMul1)) break; - } } } return !R.getMulPairs().empty(); } - void ARMParallelDSP::InsertParallelMACs(Reduction &R) { - auto CreateSMLADCall = [&](SmallVectorImpl<LoadInst*> &VecLd0, - SmallVectorImpl<LoadInst*> &VecLd1, - Value *Acc, bool Exchange, - Instruction *InsertAfter) { + auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1, + Value *Acc, bool Exchange, + Instruction *InsertAfter) { // Replace the reduction chain with an intrinsic call - IntegerType *Ty = IntegerType::get(M->getContext(), 32); - LoadInst *WideLd0 = WideLoads.count(VecLd0[0]) ? - WideLoads[VecLd0[0]]->getLoad() : CreateWideLoad(VecLd0, Ty); - LoadInst *WideLd1 = WideLoads.count(VecLd1[0]) ? - WideLoads[VecLd1[0]]->getLoad() : CreateWideLoad(VecLd1, Ty); Value* Args[] = { WideLd0, WideLd1, Acc }; Function *SMLAD = nullptr; @@ -704,34 +641,95 @@ void ARMParallelDSP::InsertParallelMACs(Reduction &R) { Intrinsic::getDeclaration(M, Intrinsic::arm_smlald); IRBuilder<NoFolder> Builder(InsertAfter->getParent(), - ++BasicBlock::iterator(InsertAfter)); + BasicBlock::iterator(InsertAfter)); Instruction *Call = Builder.CreateCall(SMLAD, Args); NumSMLAD++; return Call; }; - Instruction *InsertAfter = R.getRoot(); + // Return the instruction after the dominated instruction. + auto GetInsertPoint = [this](Value *A, Value *B) { + assert((isa<Instruction>(A) || isa<Instruction>(B)) && + "expected at least one instruction"); + + Value *V = nullptr; + if (!isa<Instruction>(A)) + V = B; + else if (!isa<Instruction>(B)) + V = A; + else + V = DT->dominates(cast<Instruction>(A), cast<Instruction>(B)) ? B : A; + + return &*++BasicBlock::iterator(cast<Instruction>(V)); + }; + Value *Acc = R.getAccumulator(); - if (!Acc) - Acc = ConstantInt::get(IntegerType::get(M->getContext(), 32), 0); - LLVM_DEBUG(dbgs() << "Root: " << *InsertAfter << "\n" - << "Acc: " << *Acc << "\n"); + // For any muls that were discovered but not paired, accumulate their values + // as before. 
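That fallback is worth a tiny model: any candidate left unpaired is folded back into the accumulator with a plain add (after a sext when the reduction is 64-bit), so pairing never drops part of the reduction (`accumulateUnpaired` is an illustrative helper):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Model of the unpaired-mul fallback: every candidate that found no
// partner is simply added back into the accumulator, mirroring the
// Builder.CreateAdd(Mul, Acc) calls below.
int64_t accumulateUnpaired(const std::vector<int32_t> &unpairedMuls,
                           int64_t acc) {
  for (int32_t mul : unpairedMuls)
    acc += mul;  // one plain add per leftover mul
  return acc;
}

int main() {
  assert(accumulateUnpaired({3, -1, 4}, 10) == 16);
  return 0;
}
```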
+ IRBuilder<NoFolder> Builder(R.getRoot()->getParent()); + MulCandList &MulCands = R.getMuls(); + for (auto &MulCand : MulCands) { + if (MulCand->Paired) + continue; + + Instruction *Mul = cast<Instruction>(MulCand->Root); + LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n"); + + if (R.getType() != Mul->getType()) { + assert(R.is64Bit() && "expected 64-bit result"); + Builder.SetInsertPoint(&*++BasicBlock::iterator(Mul)); + Mul = cast<Instruction>(Builder.CreateSExt(Mul, R.getRoot()->getType())); + } + + if (!Acc) { + Acc = Mul; + continue; + } + + // If Acc is the original incoming value to the reduction, it could be a + // phi. But the phi will dominate Mul, meaning that Mul will be the + // insertion point. + Builder.SetInsertPoint(GetInsertPoint(Mul, Acc)); + Acc = Builder.CreateAdd(Mul, Acc); + } + + if (!Acc) { + Acc = R.is64Bit() ? + ConstantInt::get(IntegerType::get(M->getContext(), 64), 0) : + ConstantInt::get(IntegerType::get(M->getContext(), 32), 0); + } else if (Acc->getType() != R.getType()) { + Builder.SetInsertPoint(R.getRoot()); + Acc = Builder.CreateSExt(Acc, R.getType()); + } + + // Roughly sort the mul pairs in their program order. + OrderedBasicBlock OrderedBB(R.getRoot()->getParent()); + llvm::sort(R.getMulPairs(), [&OrderedBB](auto &PairA, auto &PairB) { + const Instruction *A = PairA.first->Root; + const Instruction *B = PairB.first->Root; + return OrderedBB.dominates(A, B); + }); + + IntegerType *Ty = IntegerType::get(M->getContext(), 32); for (auto &Pair : R.getMulPairs()) { - BinOpChain *PMul0 = Pair.first; - BinOpChain *PMul1 = Pair.second; - LLVM_DEBUG(dbgs() << "Muls:\n" - << "- " << *PMul0->Root << "\n" - << "- " << *PMul1->Root << "\n"); + MulCandidate *LHSMul = Pair.first; + MulCandidate *RHSMul = Pair.second; + LoadInst *BaseLHS = LHSMul->getBaseLoad(); + LoadInst *BaseRHS = RHSMul->getBaseLoad(); + LoadInst *WideLHS = WideLoads.count(BaseLHS) ? + WideLoads[BaseLHS]->getLoad() : CreateWideLoad(LHSMul->VecLd, Ty); + LoadInst *WideRHS = WideLoads.count(BaseRHS) ? + WideLoads[BaseRHS]->getLoad() : CreateWideLoad(RHSMul->VecLd, Ty); - Acc = CreateSMLADCall(PMul0->VecLd, PMul1->VecLd, Acc, PMul1->Exchange, - InsertAfter); - InsertAfter = cast<Instruction>(Acc); + Instruction *InsertAfter = GetInsertPoint(WideLHS, WideRHS); + InsertAfter = GetInsertPoint(InsertAfter, Acc); + Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter); } R.UpdateRoot(cast<Instruction>(Acc)); } -LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads, +LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy) { assert(Loads.size() == 2 && "currently only support widening two loads"); @@ -758,8 +756,8 @@ LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads, return; Source->moveBefore(Sink); - for (auto &U : Source->uses()) - MoveBefore(Source, U.getUser()); + for (auto &Op : Source->operands()) + MoveBefore(Op, Source); }; // Insert the load at the point of the original dominating load. @@ -784,57 +782,30 @@ LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads, // Loads[0] needs trunc while Loads[1] needs a lshr and trunc. // TODO: Support big-endian as well. 
 Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
- BaseSExt->setOperand(0, Bottom);
+ Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType());
+ BaseSExt->replaceAllUsesWith(NewBaseSExt);
 
 IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
 Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
 Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
 Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
- OffsetSExt->setOperand(0, Trunc);
+ Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType());
+ OffsetSExt->replaceAllUsesWith(NewOffsetSExt);
+
+ LLVM_DEBUG(dbgs() << "From Base and Offset:\n"
+ << *Base << "\n" << *Offset << "\n"
+ << "Created Wide Load:\n"
+ << *WideLoad << "\n"
+ << *Bottom << "\n"
+ << *NewBaseSExt << "\n"
+ << *Top << "\n"
+ << *Trunc << "\n"
+ << *NewOffsetSExt << "\n");
 WideLoads.emplace(std::make_pair(Base,
- make_unique<WidenedLoad>(Loads, WideLoad)));
+ std::make_unique<WidenedLoad>(Loads, WideLoad)));
 return WideLoad;
 }
 
-// Compare the value lists in Other to this chain.
-bool BinOpChain::AreSymmetrical(BinOpChain *Other) {
- // Element-by-element comparison of Value lists returning true if they are
- // instructions with the same opcode or constants with the same value.
- auto CompareValueList = [](const ValueList &VL0,
- const ValueList &VL1) {
- if (VL0.size() != VL1.size()) {
- LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
- << VL0.size() << " != " << VL1.size() << "\n");
- return false;
- }
-
- const unsigned Pairs = VL0.size();
-
- for (unsigned i = 0; i < Pairs; ++i) {
- const Value *V0 = VL0[i];
- const Value *V1 = VL1[i];
- const auto *Inst0 = dyn_cast<Instruction>(V0);
- const auto *Inst1 = dyn_cast<Instruction>(V1);
-
- if (!Inst0 || !Inst1)
- return false;
-
- if (Inst0->isSameOperationAs(Inst1))
- continue;
-
- const APInt *C0, *C1;
- if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
- return false;
- }
-
- return true;
- };
-
- return CompareValueList(LHS, Other->LHS) &&
- CompareValueList(RHS, Other->RHS);
-}
-
 Pass *llvm::createARMParallelDSPPass() {
 return new ARMParallelDSP();
 }
@@ -842,6 +813,6 @@ Pass *llvm::createARMParallelDSPPass() {
 char ARMParallelDSP::ID = 0;
 
 INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
- "Transform loops to use DSP intrinsics", false, false)
+ "Transform functions to use DSP intrinsics", false, false)
 INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
- "Transform loops to use DSP intrinsics", false, false)
+ "Transform functions to use DSP intrinsics", false, false)
diff --git a/lib/Target/ARM/ARMPredicates.td b/lib/Target/ARM/ARMPredicates.td
index 0b6b40de80dd..b008d3e2e296 100644
--- a/lib/Target/ARM/ARMPredicates.td
+++ b/lib/Target/ARM/ARMPredicates.td
@@ -71,7 +71,7 @@ def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">,
 AssemblerPredicate<"HasV8_5aOps", "armv8.5a">;
 def NoVFP : Predicate<"!Subtarget->hasVFP2Base()">;
 def HasVFP2 : Predicate<"Subtarget->hasVFP2Base()">,
- AssemblerPredicate<"FeatureVFP2_D16_SP", "VFP2">;
+ AssemblerPredicate<"FeatureVFP2_SP", "VFP2">;
 def HasVFP3 : Predicate<"Subtarget->hasVFP3Base()">,
 AssemblerPredicate<"FeatureVFP3_D16_SP", "VFP3">;
 def HasVFP4 : Predicate<"Subtarget->hasVFP4Base()">,
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index 92ae26b3729d..56055a15483a 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -180,7 +180,7 @@ def Q15 : ARMReg<15, "q15", [D30, D31]>;
 // models the APSR when it's accessed by some special instructions. In such cases
 // it has the same encoding as PC.
 def CPSR : ARMReg<0, "cpsr">;
-def APSR : ARMReg<1, "apsr">;
+def APSR : ARMReg<15, "apsr">;
 def APSR_NZCV : ARMReg<15, "apsr_nzcv">;
 def SPSR : ARMReg<2, "spsr">;
 def FPSCR : ARMReg<3, "fpscr">;
@@ -486,12 +486,20 @@ def DPair : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
 
 // Pseudo-registers representing even-odd pairs of GPRs from R1 to R13/SP.
 // These are needed by instructions (e.g. ldrexd/strexd) requiring even-odd GPRs.
-def Tuples2R : RegisterTuples<[gsub_0, gsub_1],
- [(add R0, R2, R4, R6, R8, R10, R12),
- (add R1, R3, R5, R7, R9, R11, SP)]>;
+def Tuples2Rnosp : RegisterTuples<[gsub_0, gsub_1],
+ [(add R0, R2, R4, R6, R8, R10),
+ (add R1, R3, R5, R7, R9, R11)]>;
+
+def Tuples2Rsp : RegisterTuples<[gsub_0, gsub_1],
+ [(add R12), (add SP)]>;
 
 // Register class representing a pair of even-odd GPRs.
-def GPRPair : RegisterClass<"ARM", [untyped], 64, (add Tuples2R)> {
+def GPRPair : RegisterClass<"ARM", [untyped], 64, (add Tuples2Rnosp, Tuples2Rsp)> {
+ let Size = 64; // 2 x 32 bits, we have no predefined type of that size.
+}
+
+// Register class representing a pair of even-odd GPRs, except (R12, SP).
+def GPRPairnosp : RegisterClass<"ARM", [untyped], 64, (add Tuples2Rnosp)> {
 let Size = 64; // 2 x 32 bits, we have no predefined type of that size.
 }
 
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 21d32bde4710..3f0b71afd977 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -2239,9 +2239,9 @@ def A9WriteLMfpPostRA : SchedWriteVariant<[
 // Distinguish between our multiple MI-level forms of the same
 // VLDM/VSTM instructions.
 def A9PreRA : SchedPredicate<
- "TargetRegisterInfo::isVirtualRegister(MI->getOperand(0).getReg())">;
+ "Register::isVirtualRegister(MI->getOperand(0).getReg())">;
 def A9PostRA : SchedPredicate<
- "TargetRegisterInfo::isPhysicalRegister(MI->getOperand(0).getReg())">;
+ "Register::isPhysicalRegister(MI->getOperand(0).getReg())">;
 
 // VLDM represents all destination registers as a single register
 // tuple, unlike LDM. So the number of write operands is not variadic.
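The ARMScheduleA9.td hunk above tracks an LLVM-wide API move: the static isVirtualRegister/isPhysicalRegister predicates now live on the Register class (llvm/CodeGen/Register.h) rather than on TargetRegisterInfo. As a rough standalone sketch of the idiom the new predicate strings compile down to (writesVirtualReg is a hypothetical helper for illustration, not part of this commit):

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/Register.h"
using namespace llvm;

// Virtual register numbers are encoded with the high bit set, so the two
// static Register predicates cleanly partition the register operands an
// instruction can name, exactly as the A9PreRA/A9PostRA SchedPredicates do.
static bool writesVirtualReg(const MachineInstr &MI) {
  Register R = MI.getOperand(0).getReg();
  return Register::isVirtualRegister(R);
}

Before register allocation the VLDM/VSTM destination is virtual, afterwards it is physical; that is what lets the scheduler pick between the pre-RA and post-RA write variants.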
diff --git a/lib/Target/ARM/ARMScheduleM4.td b/lib/Target/ARM/ARMScheduleM4.td index 38c8ea2b4f35..bfa5fc0d7131 100644 --- a/lib/Target/ARM/ARMScheduleM4.td +++ b/lib/Target/ARM/ARMScheduleM4.td @@ -18,6 +18,9 @@ def CortexM4Model : SchedMachineModel { let PostRAScheduler = 1; let CompleteModel = 0; + let UnsupportedFeatures = [IsARM, HasNEON, HasDotProd, HasZCZ, HasMVEInt, + IsNotMClass, HasDPVFP, HasFPARMv8, HasFullFP16, Has8MSecExt, HasV8, + HasV8_3a, HasTrustZone, HasDFB, IsWindows]; } @@ -50,6 +53,7 @@ def : M4UnitL2<WriteMAC16>; def : M4UnitL2<WriteDIV>; def : M4UnitL2I<(instregex "(t|t2)LDM")>; +def : M4UnitL2I<(instregex "(t|t2)LDR")>; // Stores we use a latency of 1 as they have no outputs @@ -78,9 +82,20 @@ def : M4UnitL1<WriteNoop>; def : M4UnitL1<WritePreLd>; def : M4UnitL1I<(instregex "(t|t2)MOV")>; def : M4UnitL1I<(instrs COPY)>; -def : M4UnitL1I<(instregex "t2IT")>; -def : M4UnitL1I<(instregex "t2SEL", "t2USAD8", - "t2(S|Q|SH|U|UQ|UH)(ADD16|ASX|SAX|SUB16|ADD8|SUB8)", "t2USADA8", "(t|t2)REV")>; +def : M4UnitL1I<(instregex "t2IT", "t2MSR", "t2MRS")>; +def : M4UnitL1I<(instregex "t2CLREX")>; +def : M4UnitL1I<(instregex "t2SEL", "t2USAD8", "t2SML[AS]", + "t2(S|Q|SH|U|UQ|UH|QD)(ADD|ASX|SAX|SUB)", "t2USADA8", "(t|t2)REV")>; + +// These instructions are not of much interest to scheduling as they will not +// be generated or it is not very useful to schedule them. They are here to make +// the model more complete. +def : M4UnitL1I<(instregex "t2CDP", "t2LDC", "t2MCR", "t2MRC", "t2MRRC", "t2STC")>; +def : M4UnitL1I<(instregex "tCPS", "t2ISB", "t2DSB", "t2DMB", "t2?HINT$")>; +def : M4UnitL1I<(instregex "t2?UDF$", "tBKPT", "t2DBG")>; +def : M4UnitL1I<(instregex "t?2?Int_eh_sjlj_", "tADDframe", "t?ADJCALL")>; +def : M4UnitL1I<(instregex "CMP_SWAP", "JUMPTABLE", "MEMCPY")>; +def : M4UnitL1I<(instregex "VSETLNi32", "VGETLNi32")>; def : ReadAdvance<ReadALU, 0>; def : ReadAdvance<ReadALUsr, 0>; @@ -112,6 +127,9 @@ def : M4UnitL1<WriteVST1>; def : M4UnitL1<WriteVST2>; def : M4UnitL1<WriteVST3>; def : M4UnitL1<WriteVST4>; +def : M4UnitL1I<(instregex "VMOVS", "FCONSTS", "VCMP", "VNEG", "VABS")>; +def : M4UnitL2I<(instregex "VMOVD")>; +def : M4UnitL1I<(instregex "VMRS", "VMSR", "FMSTAT")>; def : ReadAdvance<ReadFPMUL, 0>; def : ReadAdvance<ReadFPMAC, 0>; diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 978faed776b0..09603057b2c8 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -125,7 +125,7 @@ const CallLowering *ARMSubtarget::getCallLowering() const { return CallLoweringInfo.get(); } -const InstructionSelector *ARMSubtarget::getInstructionSelector() const { +InstructionSelector *ARMSubtarget::getInstructionSelector() const { return InstSelector.get(); } @@ -205,9 +205,9 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { NoARM = true; if (isAAPCS_ABI()) - stackAlignment = 8; + stackAlignment = Align(8); if (isTargetNaCl() || isAAPCS16_ABI()) - stackAlignment = 16; + stackAlignment = Align(16); // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo:: // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as @@ -253,6 +253,10 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (isRWPI()) ReserveR9 = true; + // If MVEVectorCostFactor is still 0 (has not been set to anything else), default it to 2 + if (MVEVectorCostFactor == 0) + MVEVectorCostFactor = 2; + // FIXME: Teach TableGen to deal with these instead of doing it manually here. 
switch (ARMProcFamily) { case Others: @@ -296,13 +300,15 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { LdStMultipleTiming = SingleIssuePlusExtras; MaxInterleaveFactor = 4; if (!isThumb()) - PrefLoopAlignment = 3; + PrefLoopLogAlignment = 3; break; case Kryo: break; case Krait: PreISelOperandLatencyAdjustment = 1; break; + case NeoverseN1: + break; case Swift: MaxInterleaveFactor = 2; LdStMultipleTiming = SingleIssuePlusExtras; diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index c2b0f052b843..ef460342a69e 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -71,6 +71,7 @@ protected: Exynos, Krait, Kryo, + NeoverseN1, Swift }; enum ARMProcClassEnum { @@ -179,11 +180,9 @@ protected: bool HasVFPv3SP = false; bool HasVFPv4SP = false; bool HasFPARMv8SP = false; - bool HasVFPv2D16 = false; bool HasVFPv3D16 = false; bool HasVFPv4D16 = false; bool HasFPARMv8D16 = false; - bool HasVFPv2D16SP = false; bool HasVFPv3D16SP = false; bool HasVFPv4D16SP = false; bool HasFPARMv8D16SP = false; @@ -450,7 +449,7 @@ protected: /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. - unsigned stackAlignment = 4; + Align stackAlignment = Align(4); /// CPUString - String name of used CPU. std::string CPUString; @@ -469,7 +468,12 @@ protected: int PreISelOperandLatencyAdjustment = 2; /// What alignment is preferred for loop bodies, in log2(bytes). - unsigned PrefLoopAlignment = 0; + unsigned PrefLoopLogAlignment = 0; + + /// The cost factor for MVE instructions, representing the multiple beats an + // instruction can take. The default is 2, (set in initSubtargetFeatures so + // that we can use subtarget features less than 2). + unsigned MVEVectorCostFactor = 0; /// OptMinSize - True if we're optimising for minimum code size, equal to /// the function attribute. @@ -535,7 +539,7 @@ public: } const CallLowering *getCallLowering() const override; - const InstructionSelector *getInstructionSelector() const override; + InstructionSelector *getInstructionSelector() const override; const LegalizerInfo *getLegalizerInfo() const override; const RegisterBankInfo *getRegBankInfo() const override; @@ -600,7 +604,7 @@ public: bool hasARMOps() const { return !NoARM; } - bool hasVFP2Base() const { return HasVFPv2D16SP; } + bool hasVFP2Base() const { return HasVFPv2SP; } bool hasVFP3Base() const { return HasVFPv3D16SP; } bool hasVFP4Base() const { return HasVFPv4D16SP; } bool hasFPARMv8Base() const { return HasFPARMv8D16SP; } @@ -668,6 +672,12 @@ public: bool hasSB() const { return HasSB; } bool genLongCalls() const { return GenLongCalls; } bool genExecuteOnly() const { return GenExecuteOnly; } + bool hasBaseDSP() const { + if (isThumb()) + return hasDSP(); + else + return hasV5TEOps(); + } bool hasFP16() const { return HasFP16; } bool hasD32() const { return HasD32; } @@ -812,7 +822,7 @@ public: /// getStackAlignment - Returns the minimum alignment known to hold of the /// stack frame on entry to the function and which must be maintained by every /// function for this subtarget. 
- unsigned getStackAlignment() const { return stackAlignment; } + Align getStackAlignment() const { return stackAlignment; } unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } @@ -853,9 +863,9 @@ public: return isROPI() || !isTargetELF(); } - unsigned getPrefLoopAlignment() const { - return PrefLoopAlignment; - } + unsigned getPrefLoopLogAlignment() const { return PrefLoopLogAlignment; } + + unsigned getMVEVectorCostFactor() const { return MVEVectorCostFactor; } bool ignoreCSRForAllocationOrder(const MachineFunction &MF, unsigned PhysReg) const override; diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 7f0aae1739b3..5c8007f101d9 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -96,15 +96,16 @@ extern "C" void LLVMInitializeARMTarget() { initializeARMExpandPseudoPass(Registry); initializeThumb2SizeReducePass(Registry); initializeMVEVPTBlockPass(Registry); + initializeMVETailPredicationPass(Registry); initializeARMLowOverheadLoopsPass(Registry); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { if (TT.isOSBinFormatMachO()) - return llvm::make_unique<TargetLoweringObjectFileMachO>(); + return std::make_unique<TargetLoweringObjectFileMachO>(); if (TT.isOSWindows()) - return llvm::make_unique<TargetLoweringObjectFileCOFF>(); - return llvm::make_unique<ARMElfTargetObjectFile>(); + return std::make_unique<TargetLoweringObjectFileCOFF>(); + return std::make_unique<ARMElfTargetObjectFile>(); } static ARMBaseTargetMachine::ARMABI @@ -282,7 +283,7 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
resetTargetOptions(F); - I = llvm::make_unique<ARMSubtarget>(TargetTriple, CPU, FS, *this, isLittle, + I = std::make_unique<ARMSubtarget>(TargetTriple, CPU, FS, *this, isLittle, F.hasMinSize()); if (!I->isThumb() && !I->hasARMOps()) @@ -447,8 +448,10 @@ bool ARMPassConfig::addPreISel() { MergeExternalByDefault)); } - if (TM->getOptLevel() != CodeGenOpt::None) + if (TM->getOptLevel() != CodeGenOpt::None) { addPass(createHardwareLoopsPass()); + addPass(createMVETailPredicationPass()); + } return false; } diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 2a8ec734a05f..86c8684d14dc 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -36,8 +36,12 @@ using namespace llvm; #define DEBUG_TYPE "armtti" +static cl::opt<bool> EnableMaskedLoadStores( + "enable-arm-maskedldst", cl::Hidden, cl::init(false), + cl::desc("Enable the generation of masked loads and stores")); + static cl::opt<bool> DisableLowOverheadLoops( - "disable-arm-loloops", cl::Hidden, cl::init(true), + "disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops")); bool ARMTTIImpl::areInlineCompatible(const Function *Caller, @@ -167,6 +171,42 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, if (!SrcTy.isSimple() || !DstTy.isSimple()) return BaseT::getCastInstrCost(Opcode, Dst, Src); + // The extend of a load is free + if (I && isa<LoadInst>(I->getOperand(0))) { + static const TypeConversionCostTblEntry LoadConversionTbl[] = { + {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0}, + {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0}, + {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0}, + {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0}, + {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0}, + {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0}, + {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1}, + {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1}, + {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1}, + {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1}, + {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1}, + {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1}, + }; + if (const auto *Entry = ConvertCostTableLookup( + LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) + return Entry->Cost; + + static const TypeConversionCostTblEntry MVELoadConversionTbl[] = { + {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0}, + {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0}, + {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0}, + {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0}, + {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0}, + {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0}, + }; + if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { + if (const auto *Entry = + ConvertCostTableLookup(MVELoadConversionTbl, ISD, + DstTy.getSimpleVT(), SrcTy.getSimpleVT())) + return Entry->Cost; + } + } + // Some arithmetic, load and store operations have specific instructions // to cast up/down their types automatically at no extra cost. // TODO: Get these tables to know at least what the related operations are. @@ -313,6 +353,31 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, return Entry->Cost; } + // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one + // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext + // are linearised so take more. 
+ static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
+ };
+
+ if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
+ if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
+ ISD, DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost * ST->getMVEVectorCostFactor();
+ }
+
 // Scalar integer conversion costs.
 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
 // i16 -> i64 requires two dependent operations.
@@ -332,7 +397,10 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
 return Entry->Cost;
 }
 
- return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
+ ? ST->getMVEVectorCostFactor()
+ : 1;
+ return BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src);
 }
 
 int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
@@ -343,8 +411,8 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
 return 3;
 
- if ((Opcode == Instruction::InsertElement ||
- Opcode == Instruction::ExtractElement)) {
+ if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
+ Opcode == Instruction::ExtractElement)) {
 // Cross-class copies are expensive on many microarchitectures,
 // so assume they are expensive by default.
 if (ValTy->getVectorElementType()->isIntegerTy())
@@ -357,6 +425,17 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
 return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
 }
 
+ if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
+ Opcode == Instruction::ExtractElement)) {
+ // We say MVE moves cost at least the MVEVectorCostFactor, even though
+ // they are scalar instructions. This helps prevent mixing scalar and
+ // vector, to prevent vectorising where we end up just scalarising the
+ // result anyway.
+ return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index),
+ ST->getMVEVectorCostFactor()) *
+ ValTy->getVectorNumElements() / 2;
+ }
+
 return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
 }
 
@@ -385,7 +464,10 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
 return LT.first;
 }
 
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+ int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy()
+ ? ST->getMVEVectorCostFactor()
+ : 1;
+ return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
 }
 
 int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
@@ -397,13 +479,37 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
 unsigned NumVectorInstToHideOverhead = 10;
 int MaxMergeDistance = 64;
 
- if (Ty->isVectorTy() && SE &&
- !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
- return NumVectorInstToHideOverhead;
+ if (ST->hasNEON()) {
+ if (Ty->isVectorTy() && SE &&
+ !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
+ return NumVectorInstToHideOverhead;
 
- // In many cases the address computation is not merged into the instruction
- // addressing mode.
- return 1;
+ // In many cases the address computation is not merged into the instruction
+ // addressing mode.
+ return 1;
+ }
+ return BaseT::getAddressComputationCost(Ty, SE, Ptr);
+}
+
+bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
+ if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
+ return false;
+
+ if (auto *VecTy = dyn_cast<VectorType>(DataTy)) {
+ // Don't support v2i1 yet.
+ if (VecTy->getNumElements() == 2)
+ return false;
+
+ // We don't support extending fp types.
+ unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
+ if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
+ return false;
+ }
+
+ unsigned EltWidth = DataTy->getScalarSizeInBits();
+ return (EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
+ (EltWidth == 16 && (!Alignment || Alignment >= 2)) ||
+ (EltWidth == 8);
 }
 
 int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
@@ -442,78 +548,96 @@ int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
 
 int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
 Type *SubTp) {
- if (Kind == TTI::SK_Broadcast) {
- static const CostTblEntry NEONDupTbl[] = {
- // VDUP handles these cases.
- {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
+ if (ST->hasNEON()) {
+ if (Kind == TTI::SK_Broadcast) {
+ static const CostTblEntry NEONDupTbl[] = {
+ // VDUP handles these cases.
+ {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
 
- {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
 
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
 
- if (const auto *Entry = CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE,
- LT.second))
- return LT.first * Entry->Cost;
+ if (const auto *Entry =
+ CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+ }
+ if (Kind == TTI::SK_Reverse) {
+ static const CostTblEntry NEONShuffleTbl[] = {
+ // Reverse shuffle cost one instruction if we are shuffling within a
+ // double word (vrev) or two if we shuffle a quad word (vrev, vext).
+ {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); - } - if (Kind == TTI::SK_Reverse) { - static const CostTblEntry NEONShuffleTbl[] = { - // Reverse shuffle cost one instruction if we are shuffling within a - // double word (vrev) or two if we shuffle a quad word (vrev, vext). - {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; - {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, - {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, - {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, - {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + if (const auto *Entry = + CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + } + if (Kind == TTI::SK_Select) { + static const CostTblEntry NEONSelShuffleTbl[] = { + // Select shuffle cost table for ARM. Cost is the number of + // instructions + // required to create the shuffled vector. - if (const auto *Entry = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, - LT.second)) - return LT.first * Entry->Cost; + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); - } - if (Kind == TTI::SK_Select) { - static const CostTblEntry NEONSelShuffleTbl[] = { - // Select shuffle cost table for ARM. Cost is the number of instructions - // required to create the shuffled vector. + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2}, - {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16}, - {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, - {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, - {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; - {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16}, + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + } + } + if (ST->hasMVEIntegerOps()) { + if (Kind == TTI::SK_Broadcast) { + static const CostTblEntry MVEDupTbl[] = { + // VDUP handles these cases. 
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
 
- {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
 
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
 
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
- if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
- ISD::VECTOR_SHUFFLE, LT.second))
- return LT.first * Entry->Cost;
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
+ LT.second))
+ return LT.first * Entry->Cost * ST->getMVEVectorCostFactor();
+ }
 }
 
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
+ ? ST->getMVEVectorCostFactor()
+ : 1;
+ return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 }
 
 int ARMTTIImpl::getArithmeticInstrCost(
@@ -567,38 +691,64 @@ int ARMTTIImpl::getArithmeticInstrCost(
 // Multiplication.
 };
 
- if (ST->hasNEON())
+ if (ST->hasNEON()) {
 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
 return LT.first * Entry->Cost;
 
- int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
- Opd1PropInfo, Opd2PropInfo);
+ int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+ Opd1PropInfo, Opd2PropInfo);
+
+ // This is somewhat of a hack. The problem that we are facing is that SROA
+ // creates a sequence of shift, and, or instructions to construct values.
+ // These sequences are recognized by the ISel and have zero-cost. Not so for
+ // the vectorized code. Because we have support for v2i64 but not i64 those
+ // sequences look particularly beneficial to vectorize.
+ // To work around this we increase the cost of v2i64 operations to make them
+ // seem less beneficial.
+ if (LT.second == MVT::v2i64 &&
+ Op2Info == TargetTransformInfo::OK_UniformConstantValue)
+ Cost += 4;
 
- // This is somewhat of a hack. The problem that we are facing is that SROA
- // creates a sequence of shift, and, or instructions to construct values.
- // These sequences are recognized by the ISel and have zero-cost. Not so for
- // the vectorized code. Because we have support for v2i64 but not i64 those
- // sequences look particularly beneficial to vectorize.
- // To work around this we increase the cost of v2i64 operations to make them
- // seem less beneficial.
- if (LT.second == MVT::v2i64 &&
- Op2Info == TargetTransformInfo::OK_UniformConstantValue)
- Cost += 4;
+ return Cost;
+ }
+
+ int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy()
+ ? ST->getMVEVectorCostFactor()
+ : 1;
 
- return Cost;
+ // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
+ // without treating floats as more expensive than scalars or increasing the
+ // costs for custom operations. The result is also multiplied by the
+ // MVEVectorCostFactor where appropriate.
+ if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
+ return LT.first * BaseCost;
+
+ // Else this is expand, assume that we need to scalarize this op.
+ if (Ty->isVectorTy()) {
+ unsigned Num = Ty->getVectorNumElements();
+ unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
+ // Return the cost of multiple scalar invocation plus the cost of
+ // inserting and extracting the values.
+ return BaseT::getScalarizationOverhead(Ty, Args) + Num * Cost; + } + + return BaseCost; } int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I) { std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); - if (Src->isVectorTy() && Alignment != 16 && + if (ST->hasNEON() && Src->isVectorTy() && Alignment != 16 && Src->getVectorElementType()->isDoubleTy()) { // Unaligned loads/stores are extremely inefficient. // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr. return LT.first * 4; } - return LT.first; + int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy() + ? ST->getMVEVectorCostFactor() + : 1; + return BaseCost * LT.first; } int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, @@ -893,6 +1043,11 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, } return; } + // Don't unroll vectorised loop. MVE does not benefit from it as much as + // scalar code. + if (I.getType()->isVectorTy()) + return; + SmallVector<const Value*, 4> Operands(I.value_op_begin(), I.value_op_end()); Cost += getUserCost(&I, Operands); @@ -914,3 +1069,28 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, if (Cost < 12) UP.Force = true; } + +bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const { + assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type"); + unsigned ScalarBits = Ty->getScalarSizeInBits(); + if (!ST->hasMVEIntegerOps()) + return false; + + switch (Opcode) { + case Instruction::FAdd: + case Instruction::FMul: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Mul: + case Instruction::FCmp: + return false; + case Instruction::ICmp: + case Instruction::Add: + return ScalarBits < 64 && ScalarBits * Ty->getVectorNumElements() == 128; + default: + llvm_unreachable("Unhandled reduction opcode"); + } + return false; +} diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h index 52f6ea4a6e2f..a878fdcfe3c7 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/lib/Target/ARM/ARMTargetTransformInfo.h @@ -101,9 +101,9 @@ public: /// Floating-point computation using ARMv8 AArch32 Advanced /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD - /// is IEEE-754 compliant, but it's not covered in this target. + /// and Arm MVE are IEEE-754 compliant. 
bool isFPVectorizationPotentiallyUnsafe() { - return !ST->isTargetDarwin(); + return !ST->isTargetDarwin() && !ST->hasMVEFloatOps(); } /// \name Scalar TTI Implementations @@ -122,10 +122,13 @@ public: /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(bool Vector) { + unsigned getNumberOfRegisters(unsigned ClassID) const { + bool Vector = (ClassID == 1); if (Vector) { if (ST->hasNEON()) return 16; + if (ST->hasMVEIntegerOps()) + return 8; return 0; } @@ -138,6 +141,8 @@ public: if (Vector) { if (ST->hasNEON()) return 128; + if (ST->hasMVEIntegerOps()) + return 128; return 0; } @@ -148,10 +153,23 @@ public: return ST->getMaxInterleaveFactor(); } + bool isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment); + + bool isLegalMaskedStore(Type *DataTy, MaybeAlign Alignment) { + return isLegalMaskedLoad(DataTy, Alignment); + } + int getMemcpyCost(const Instruction *I); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const; + + bool shouldExpandReduction(const IntrinsicInst *II) const { + return false; + } + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I = nullptr); diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 1da9452f1d22..d2c355c1da75 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -2275,6 +2275,14 @@ public: return Value >= 1 && Value <= 32; } + bool isMveSaturateOp() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + uint64_t Value = CE->getValue(); + return Value == 48 || Value == 64; + } + bool isITCondCodeNoAL() const { if (!isITCondCode()) return false; ARMCC::CondCodes CC = getCondCode(); @@ -2479,28 +2487,28 @@ public: void addModImmNotOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); uint32_t Enc = ARM_AM::getSOImmVal(~CE->getValue()); Inst.addOperand(MCOperand::createImm(Enc)); } void addModImmNegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); uint32_t Enc = ARM_AM::getSOImmVal(-CE->getValue()); Inst.addOperand(MCOperand::createImm(Enc)); } void addThumbModImmNeg8_255Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); uint32_t Val = -CE->getValue(); Inst.addOperand(MCOperand::createImm(Val)); } void addThumbModImmNeg1_7Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); uint32_t Val = -CE->getValue(); Inst.addOperand(MCOperand::createImm(Val)); } @@ -2523,19 +2531,19 @@ public: void addFBits16Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(16 - CE->getValue())); 
} void addFBits32Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(32 - CE->getValue())); } void addFPImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue())); Inst.addOperand(MCOperand::createImm(Val)); } @@ -2544,7 +2552,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // FIXME: We really want to scale the value here, but the LDRD/STRD // instruction don't encode operands that way yet. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } @@ -2552,35 +2560,31 @@ public: assert(N == 1 && "Invalid number of operands!"); // FIXME: We really want to scale the value here, but the VSTR/VLDR_VSYSR // instruction don't encode operands that way yet. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } void addImm7Shift0Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); - assert(CE != nullptr && "Invalid operand type!"); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } void addImm7Shift1Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); - assert(CE != nullptr && "Invalid operand type!"); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } void addImm7Shift2Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); - assert(CE != nullptr && "Invalid operand type!"); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } void addImm7Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); - assert(CE != nullptr && "Invalid operand type!"); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } @@ -2588,7 +2592,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The immediate is scaled by four in the encoding and is stored // in the MCInst as such. Lop off the low two bits here. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue() / 4)); } @@ -2596,7 +2600,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The immediate is scaled by four in the encoding and is stored // in the MCInst as such. Lop off the low two bits here. 
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(-(CE->getValue() / 4))); } @@ -2604,7 +2608,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The immediate is scaled by four in the encoding and is stored // in the MCInst as such. Lop off the low two bits here. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue() / 4)); } @@ -2612,7 +2616,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The constant encodes as the immediate-1, and we store in the instruction // the bits as encoded, so subtract off one here. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue() - 1)); } @@ -2620,7 +2624,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The constant encodes as the immediate-1, and we store in the instruction // the bits as encoded, so subtract off one here. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue() - 1)); } @@ -2628,7 +2632,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The constant encodes as the immediate, except for 32, which encodes as // zero. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); unsigned Imm = CE->getValue(); Inst.addOperand(MCOperand::createImm((Imm == 32 ? 0 : Imm))); } @@ -2637,7 +2641,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // An ASR value of 32 encodes as 0, so that's how we want to add it to // the instruction as well. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); int Val = CE->getValue(); Inst.addOperand(MCOperand::createImm(Val == 32 ? 0 : Val)); } @@ -2646,7 +2650,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The operand is actually a t2_so_imm, but we have its bitwise // negation in the assembly source, so twiddle it here. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(~(uint32_t)CE->getValue())); } @@ -2654,7 +2658,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The operand is actually a t2_so_imm, but we have its // negation in the assembly source, so twiddle it here. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(-(uint32_t)CE->getValue())); } @@ -2662,7 +2666,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The operand is actually an imm0_4095, but we have its // negation in the assembly source, so twiddle it here. 
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(-(uint32_t)CE->getValue())); } @@ -2671,9 +2675,7 @@ public: Inst.addOperand(MCOperand::createImm(CE->getValue() >> 2)); return; } - - const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val); - assert(SR && "Unknown value type!"); + const MCSymbolRefExpr *SR = cast<MCSymbolRefExpr>(Imm.Val); Inst.addOperand(MCOperand::createExpr(SR)); } @@ -2685,10 +2687,7 @@ public: Inst.addOperand(MCOperand::createImm(CE->getValue())); return; } - - const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val); - - assert(SR && "Unknown value type!"); + const MCSymbolRefExpr *SR = cast<MCSymbolRefExpr>(Imm.Val); Inst.addOperand(MCOperand::createExpr(SR)); return; } @@ -2750,7 +2749,7 @@ public: return; } - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); int Val = CE->getValue(); Inst.addOperand(MCOperand::createImm(Val)); } @@ -3130,7 +3129,7 @@ public: void addPowerTwoOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } @@ -3225,14 +3224,14 @@ public: assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. // Mask in that this is an i8 splat. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue() | 0xe00)); } void addNEONi16splatOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); unsigned Value = CE->getValue(); Value = ARM_AM::encodeNEONi16splat(Value); Inst.addOperand(MCOperand::createImm(Value)); @@ -3241,7 +3240,7 @@ public: void addNEONi16splatNotOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); unsigned Value = CE->getValue(); Value = ARM_AM::encodeNEONi16splat(~Value & 0xffff); Inst.addOperand(MCOperand::createImm(Value)); @@ -3250,7 +3249,7 @@ public: void addNEONi32splatOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); unsigned Value = CE->getValue(); Value = ARM_AM::encodeNEONi32splat(Value); Inst.addOperand(MCOperand::createImm(Value)); @@ -3259,7 +3258,7 @@ public: void addNEONi32splatNotOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. 
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); unsigned Value = CE->getValue(); Value = ARM_AM::encodeNEONi32splat(~Value); Inst.addOperand(MCOperand::createImm(Value)); @@ -3267,7 +3266,7 @@ public: void addNEONi8ReplicateOperands(MCInst &Inst, bool Inv) const { // The immediate encodes the type of constant as well as the value. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); assert((Inst.getOpcode() == ARM::VMOVv8i8 || Inst.getOpcode() == ARM::VMOVv16i8) && "All instructions that wants to replicate non-zero byte " @@ -3298,7 +3297,7 @@ public: void addNEONi32vmovOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); unsigned Value = encodeNeonVMOVImmediate(CE->getValue()); Inst.addOperand(MCOperand::createImm(Value)); } @@ -3310,7 +3309,7 @@ public: void addNEONvmovi16ReplicateOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); assert((Inst.getOpcode() == ARM::VMOVv4i16 || Inst.getOpcode() == ARM::VMOVv8i16 || Inst.getOpcode() == ARM::VMVNv4i16 || @@ -3327,14 +3326,14 @@ public: void addNEONi32vmovNegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); unsigned Value = encodeNeonVMOVImmediate(~CE->getValue()); Inst.addOperand(MCOperand::createImm(Value)); } void addNEONvmovi32ReplicateOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); assert((Inst.getOpcode() == ARM::VMOVv2i32 || Inst.getOpcode() == ARM::VMOVv4i32 || Inst.getOpcode() == ARM::VMVNv2i32 || @@ -3349,7 +3348,7 @@ public: void addNEONi64splatOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. 
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); uint64_t Value = CE->getValue(); unsigned Imm = 0; for (unsigned i = 0; i < 8; ++i, Value >>= 8) { @@ -3360,20 +3359,28 @@ public: void addComplexRotationEvenOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue() / 90)); } void addComplexRotationOddOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm((CE->getValue() - 90) / 180)); } + void addMveSaturateOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); + unsigned Imm = CE->getValue(); + assert((Imm == 48 || Imm == 64) && "Invalid saturate operand"); + Inst.addOperand(MCOperand::createImm(Imm == 48 ? 1 : 0)); + } + void print(raw_ostream &OS) const override; static std::unique_ptr<ARMOperand> CreateITMask(unsigned Mask, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_ITCondMask); + auto Op = std::make_unique<ARMOperand>(k_ITCondMask); Op->ITMask.Mask = Mask; Op->StartLoc = S; Op->EndLoc = S; @@ -3382,7 +3389,7 @@ public: static std::unique_ptr<ARMOperand> CreateCondCode(ARMCC::CondCodes CC, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_CondCode); + auto Op = std::make_unique<ARMOperand>(k_CondCode); Op->CC.Val = CC; Op->StartLoc = S; Op->EndLoc = S; @@ -3391,7 +3398,7 @@ public: static std::unique_ptr<ARMOperand> CreateVPTPred(ARMVCC::VPTCodes CC, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_VPTPred); + auto Op = std::make_unique<ARMOperand>(k_VPTPred); Op->VCC.Val = CC; Op->StartLoc = S; Op->EndLoc = S; @@ -3399,7 +3406,7 @@ public: } static std::unique_ptr<ARMOperand> CreateCoprocNum(unsigned CopVal, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_CoprocNum); + auto Op = std::make_unique<ARMOperand>(k_CoprocNum); Op->Cop.Val = CopVal; Op->StartLoc = S; Op->EndLoc = S; @@ -3407,7 +3414,7 @@ public: } static std::unique_ptr<ARMOperand> CreateCoprocReg(unsigned CopVal, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_CoprocReg); + auto Op = std::make_unique<ARMOperand>(k_CoprocReg); Op->Cop.Val = CopVal; Op->StartLoc = S; Op->EndLoc = S; @@ -3416,7 +3423,7 @@ public: static std::unique_ptr<ARMOperand> CreateCoprocOption(unsigned Val, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_CoprocOption); + auto Op = std::make_unique<ARMOperand>(k_CoprocOption); Op->Cop.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -3424,7 +3431,7 @@ public: } static std::unique_ptr<ARMOperand> CreateCCOut(unsigned RegNum, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_CCOut); + auto Op = std::make_unique<ARMOperand>(k_CCOut); Op->Reg.RegNum = RegNum; Op->StartLoc = S; Op->EndLoc = S; @@ -3432,7 +3439,7 @@ public: } static std::unique_ptr<ARMOperand> CreateToken(StringRef Str, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_Token); + auto Op = std::make_unique<ARMOperand>(k_Token); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->StartLoc = S; @@ -3442,7 +3449,7 @@ public: static std::unique_ptr<ARMOperand> CreateReg(unsigned RegNum, SMLoc S, SMLoc E) { - auto Op = 
make_unique<ARMOperand>(k_Register); + auto Op = std::make_unique<ARMOperand>(k_Register); Op->Reg.RegNum = RegNum; Op->StartLoc = S; Op->EndLoc = E; @@ -3453,7 +3460,7 @@ public: CreateShiftedRegister(ARM_AM::ShiftOpc ShTy, unsigned SrcReg, unsigned ShiftReg, unsigned ShiftImm, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_ShiftedRegister); + auto Op = std::make_unique<ARMOperand>(k_ShiftedRegister); Op->RegShiftedReg.ShiftTy = ShTy; Op->RegShiftedReg.SrcReg = SrcReg; Op->RegShiftedReg.ShiftReg = ShiftReg; @@ -3466,7 +3473,7 @@ public: static std::unique_ptr<ARMOperand> CreateShiftedImmediate(ARM_AM::ShiftOpc ShTy, unsigned SrcReg, unsigned ShiftImm, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_ShiftedImmediate); + auto Op = std::make_unique<ARMOperand>(k_ShiftedImmediate); Op->RegShiftedImm.ShiftTy = ShTy; Op->RegShiftedImm.SrcReg = SrcReg; Op->RegShiftedImm.ShiftImm = ShiftImm; @@ -3477,7 +3484,7 @@ public: static std::unique_ptr<ARMOperand> CreateShifterImm(bool isASR, unsigned Imm, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_ShifterImmediate); + auto Op = std::make_unique<ARMOperand>(k_ShifterImmediate); Op->ShifterImm.isASR = isASR; Op->ShifterImm.Imm = Imm; Op->StartLoc = S; @@ -3487,7 +3494,7 @@ public: static std::unique_ptr<ARMOperand> CreateRotImm(unsigned Imm, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_RotateImmediate); + auto Op = std::make_unique<ARMOperand>(k_RotateImmediate); Op->RotImm.Imm = Imm; Op->StartLoc = S; Op->EndLoc = E; @@ -3496,7 +3503,7 @@ public: static std::unique_ptr<ARMOperand> CreateModImm(unsigned Bits, unsigned Rot, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_ModifiedImmediate); + auto Op = std::make_unique<ARMOperand>(k_ModifiedImmediate); Op->ModImm.Bits = Bits; Op->ModImm.Rot = Rot; Op->StartLoc = S; @@ -3506,7 +3513,7 @@ public: static std::unique_ptr<ARMOperand> CreateConstantPoolImm(const MCExpr *Val, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_ConstantPoolImmediate); + auto Op = std::make_unique<ARMOperand>(k_ConstantPoolImmediate); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -3515,7 +3522,7 @@ public: static std::unique_ptr<ARMOperand> CreateBitfield(unsigned LSB, unsigned Width, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_BitfieldDescriptor); + auto Op = std::make_unique<ARMOperand>(k_BitfieldDescriptor); Op->Bitfield.LSB = LSB; Op->Bitfield.Width = Width; Op->StartLoc = S; @@ -3543,16 +3550,15 @@ public: Kind = k_SPRRegisterList; } - // Sort based on the register encoding values. 
- array_pod_sort(Regs.begin(), Regs.end()); - if (Kind == k_RegisterList && Regs.back().second == ARM::APSR) Kind = k_RegisterListWithAPSR; - auto Op = make_unique<ARMOperand>(Kind); - for (SmallVectorImpl<std::pair<unsigned, unsigned>>::const_iterator - I = Regs.begin(), E = Regs.end(); I != E; ++I) - Op->Registers.push_back(I->second); + assert(std::is_sorted(Regs.begin(), Regs.end()) && + "Register list must be sorted by encoding"); + + auto Op = std::make_unique<ARMOperand>(Kind); + for (const auto &P : Regs) + Op->Registers.push_back(P.second); Op->StartLoc = StartLoc; Op->EndLoc = EndLoc; @@ -3563,7 +3569,7 @@ public: unsigned Count, bool isDoubleSpaced, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_VectorList); + auto Op = std::make_unique<ARMOperand>(k_VectorList); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; Op->VectorList.isDoubleSpaced = isDoubleSpaced; @@ -3575,7 +3581,7 @@ public: static std::unique_ptr<ARMOperand> CreateVectorListAllLanes(unsigned RegNum, unsigned Count, bool isDoubleSpaced, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_VectorListAllLanes); + auto Op = std::make_unique<ARMOperand>(k_VectorListAllLanes); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; Op->VectorList.isDoubleSpaced = isDoubleSpaced; @@ -3587,7 +3593,7 @@ public: static std::unique_ptr<ARMOperand> CreateVectorListIndexed(unsigned RegNum, unsigned Count, unsigned Index, bool isDoubleSpaced, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_VectorListIndexed); + auto Op = std::make_unique<ARMOperand>(k_VectorListIndexed); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; Op->VectorList.LaneIndex = Index; @@ -3599,7 +3605,7 @@ public: static std::unique_ptr<ARMOperand> CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique<ARMOperand>(k_VectorIndex); + auto Op = std::make_unique<ARMOperand>(k_VectorIndex); Op->VectorIndex.Val = Idx; Op->StartLoc = S; Op->EndLoc = E; @@ -3608,7 +3614,7 @@ public: static std::unique_ptr<ARMOperand> CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_Immediate); + auto Op = std::make_unique<ARMOperand>(k_Immediate); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -3620,7 +3626,7 @@ public: unsigned OffsetRegNum, ARM_AM::ShiftOpc ShiftType, unsigned ShiftImm, unsigned Alignment, bool isNegative, SMLoc S, SMLoc E, SMLoc AlignmentLoc = SMLoc()) { - auto Op = make_unique<ARMOperand>(k_Memory); + auto Op = std::make_unique<ARMOperand>(k_Memory); Op->Memory.BaseRegNum = BaseRegNum; Op->Memory.OffsetImm = OffsetImm; Op->Memory.OffsetRegNum = OffsetRegNum; @@ -3637,7 +3643,7 @@ public: static std::unique_ptr<ARMOperand> CreatePostIdxReg(unsigned RegNum, bool isAdd, ARM_AM::ShiftOpc ShiftTy, unsigned ShiftImm, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_PostIndexRegister); + auto Op = std::make_unique<ARMOperand>(k_PostIndexRegister); Op->PostIdxReg.RegNum = RegNum; Op->PostIdxReg.isAdd = isAdd; Op->PostIdxReg.ShiftTy = ShiftTy; @@ -3649,7 +3655,7 @@ public: static std::unique_ptr<ARMOperand> CreateMemBarrierOpt(ARM_MB::MemBOpt Opt, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_MemBarrierOpt); + auto Op = std::make_unique<ARMOperand>(k_MemBarrierOpt); Op->MBOpt.Val = Opt; Op->StartLoc = S; Op->EndLoc = S; @@ -3658,7 +3664,7 @@ public: static std::unique_ptr<ARMOperand> CreateInstSyncBarrierOpt(ARM_ISB::InstSyncBOpt Opt, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_InstSyncBarrierOpt); + auto Op = 
std::make_unique<ARMOperand>(k_InstSyncBarrierOpt); Op->ISBOpt.Val = Opt; Op->StartLoc = S; Op->EndLoc = S; @@ -3667,7 +3673,7 @@ public: static std::unique_ptr<ARMOperand> CreateTraceSyncBarrierOpt(ARM_TSB::TraceSyncBOpt Opt, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_TraceSyncBarrierOpt); + auto Op = std::make_unique<ARMOperand>(k_TraceSyncBarrierOpt); Op->TSBOpt.Val = Opt; Op->StartLoc = S; Op->EndLoc = S; @@ -3676,7 +3682,7 @@ public: static std::unique_ptr<ARMOperand> CreateProcIFlags(ARM_PROC::IFlags IFlags, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_ProcIFlags); + auto Op = std::make_unique<ARMOperand>(k_ProcIFlags); Op->IFlags.Val = IFlags; Op->StartLoc = S; Op->EndLoc = S; @@ -3684,7 +3690,7 @@ public: } static std::unique_ptr<ARMOperand> CreateMSRMask(unsigned MMask, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_MSRMask); + auto Op = std::make_unique<ARMOperand>(k_MSRMask); Op->MMask.Val = MMask; Op->StartLoc = S; Op->EndLoc = S; @@ -3692,7 +3698,7 @@ public: } static std::unique_ptr<ARMOperand> CreateBankedReg(unsigned Reg, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_BankedReg); + auto Op = std::make_unique<ARMOperand>(k_BankedReg); Op->BankedReg.Val = Reg; Op->StartLoc = S; Op->EndLoc = S; @@ -4253,6 +4259,24 @@ static unsigned getNextRegister(unsigned Reg) { } } +// Insert an <Encoding, Register> pair in an ordered vector. Return true on +// success, or false, if duplicate encoding found. +static bool +insertNoDuplicates(SmallVectorImpl<std::pair<unsigned, unsigned>> &Regs, + unsigned Enc, unsigned Reg) { + Regs.emplace_back(Enc, Reg); + for (auto I = Regs.rbegin(), J = I + 1, E = Regs.rend(); J != E; ++I, ++J) { + if (J->first == Enc) { + Regs.erase(J.base()); + return false; + } + if (J->first < Enc) + break; + std::swap(*I, *J); + } + return true; +} + /// Parse a register list. bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder) { @@ -4278,7 +4302,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) { Reg = getDRegFromQReg(Reg); EReg = MRI->getEncodingValue(Reg); - Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg)); + Registers.emplace_back(EReg, Reg); ++Reg; } const MCRegisterClass *RC; @@ -4295,7 +4319,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, // Store the register. EReg = MRI->getEncodingValue(Reg); - Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg)); + Registers.emplace_back(EReg, Reg); // This starts immediately after the first register token in the list, // so we can see either a comma or a minus (range separator) as a legal @@ -4326,7 +4350,11 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, while (Reg != EndReg) { Reg = getNextRegister(Reg); EReg = MRI->getEncodingValue(Reg); - Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg)); + if (!insertNoDuplicates(Registers, EReg, Reg)) { + Warning(AfterMinusLoc, StringRef("duplicated register (") + + ARMInstPrinter::getRegisterName(Reg) + + ") in register list"); + } } continue; } @@ -4350,11 +4378,16 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, // subset of GPRRegClassId except it contains APSR as well. 
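The insertNoDuplicates helper above keeps the register list ordered by encoding as it is built, bubbling each new entry backwards into place and rejecting a duplicate encoding on the spot; that is what lets CreateRegList assert sortedness instead of sorting. The same idea as a self-contained sketch:

#include <cassert>
#include <utility>
#include <vector>

// Insert an <Encoding, Register> pair into an encoding-ordered vector.
// Returns true on success, false if the encoding is already present,
// mirroring the parser's insertNoDuplicates helper.
static bool insertNoDuplicates(std::vector<std::pair<unsigned, unsigned>> &Regs,
                               unsigned Enc, unsigned Reg) {
  Regs.emplace_back(Enc, Reg);
  // Bubble the new entry backwards to its sorted position.
  for (auto I = Regs.rbegin(), J = I + 1, E = Regs.rend(); J != E; ++I, ++J) {
    if (J->first == Enc) { // duplicate: undo the insertion
      Regs.erase(J.base());
      return false;
    }
    if (J->first < Enc)    // found the sorted position
      break;
    std::swap(*I, *J);
  }
  return true;
}

int main() {
  std::vector<std::pair<unsigned, unsigned>> Regs;
  assert(insertNoDuplicates(Regs, 2, 102));
  assert(insertNoDuplicates(Regs, 0, 100));  // inserted in front
  assert(!insertNoDuplicates(Regs, 2, 102)); // duplicate rejected
  assert(Regs.size() == 2 && Regs[0].first == 0 && Regs[1].first == 2);
}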
RC = &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID]; } - if (Reg == ARM::VPR && (RC == &ARMMCRegisterClasses[ARM::SPRRegClassID] || - RC == &ARMMCRegisterClasses[ARM::DPRRegClassID])) { + if (Reg == ARM::VPR && + (RC == &ARMMCRegisterClasses[ARM::SPRRegClassID] || + RC == &ARMMCRegisterClasses[ARM::DPRRegClassID] || + RC == &ARMMCRegisterClasses[ARM::FPWithVPRRegClassID])) { RC = &ARMMCRegisterClasses[ARM::FPWithVPRRegClassID]; EReg = MRI->getEncodingValue(Reg); - Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg)); + if (!insertNoDuplicates(Registers, EReg, Reg)) { + Warning(RegLoc, "duplicated register (" + RegTok.getString() + + ") in register list"); + } continue; } // The register must be in the same register class as the first. @@ -4371,21 +4404,19 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, else if (!ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(Reg)) return Error(RegLoc, "register list not in ascending order"); } - if (MRI->getEncodingValue(Reg) == MRI->getEncodingValue(OldReg)) { - Warning(RegLoc, "duplicated register (" + RegTok.getString() + - ") in register list"); - continue; - } // VFP register lists must also be contiguous. if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] && RC != &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID] && Reg != OldReg + 1) return Error(RegLoc, "non-contiguous register range"); EReg = MRI->getEncodingValue(Reg); - Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg)); + if (!insertNoDuplicates(Registers, EReg, Reg)) { + Warning(RegLoc, "duplicated register (" + RegTok.getString() + + ") in register list"); + } if (isQReg) { EReg = MRI->getEncodingValue(++Reg); - Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg)); + Registers.emplace_back(EReg, Reg); } } @@ -5702,14 +5733,16 @@ bool ARMAsmParser::parseMemory(OperandVector &Operands) { return false; } - // If we have a '#', it's an immediate offset, else assume it's a register - // offset. Be friendly and also accept a plain integer (without a leading - // hash) for gas compatibility. + // If we have a '#' or '$', it's an immediate offset, else assume it's a + // register offset. Be friendly and also accept a plain integer or expression + // (without a leading hash) for gas compatibility. if (Parser.getTok().is(AsmToken::Hash) || Parser.getTok().is(AsmToken::Dollar) || + Parser.getTok().is(AsmToken::LParen) || Parser.getTok().is(AsmToken::Integer)) { - if (Parser.getTok().isNot(AsmToken::Integer)) - Parser.Lex(); // Eat '#' or '$'. 
+ if (Parser.getTok().is(AsmToken::Hash) || + Parser.getTok().is(AsmToken::Dollar)) + Parser.Lex(); // Eat '#' or '$' E = Parser.getTok().getLoc(); bool isNegative = getParser().getTok().is(AsmToken::Minus); @@ -11308,7 +11341,7 @@ bool ARMAsmParser::parseDirectiveUnwindRaw(SMLoc L) { SmallVector<uint8_t, 16> Opcodes; auto parseOne = [&]() -> bool { - const MCExpr *OE; + const MCExpr *OE = nullptr; SMLoc OpcodeLoc = getLexer().getLoc(); if (check(getLexer().is(AsmToken::EndOfStatement) || Parser.parseExpression(OE), @@ -11694,14 +11727,14 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) { { ARM::AEK_CRYPTO, {Feature_HasV8Bit}, {ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} }, { ARM::AEK_FP, {Feature_HasV8Bit}, - {ARM::FeatureVFP2_D16_SP, ARM::FeatureFPARMv8} }, + {ARM::FeatureVFP2_SP, ARM::FeatureFPARMv8} }, { (ARM::AEK_HWDIVTHUMB | ARM::AEK_HWDIVARM), {Feature_HasV7Bit, Feature_IsNotMClassBit}, {ARM::FeatureHWDivThumb, ARM::FeatureHWDivARM} }, { ARM::AEK_MP, {Feature_HasV7Bit, Feature_IsNotMClassBit}, {ARM::FeatureMP} }, { ARM::AEK_SIMD, {Feature_HasV8Bit}, - {ARM::FeatureNEON, ARM::FeatureVFP2_D16_SP, ARM::FeatureFPARMv8} }, + {ARM::FeatureNEON, ARM::FeatureVFP2_SP, ARM::FeatureFPARMv8} }, { ARM::AEK_SEC, {Feature_HasV6KBit}, {ARM::FeatureTrustZone} }, // FIXME: Only available in A-class, isel not predicated { ARM::AEK_VIRT, {Feature_HasV7Bit}, {ARM::FeatureVirtualization} }, @@ -11775,19 +11808,19 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, // immediate in the syntax. switch (Kind) { default: break; - case MCK__35_0: + case MCK__HASH_0: if (Op.isImm()) if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm())) if (CE->getValue() == 0) return Match_Success; break; - case MCK__35_8: + case MCK__HASH_8: if (Op.isImm()) if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm())) if (CE->getValue() == 8) return Match_Success; break; - case MCK__35_16: + case MCK__HASH_16: if (Op.isImm()) if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm())) if (CE->getValue() == 16) diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 673691ebd93e..eabc26d05f47 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -314,7 +314,7 @@ static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeNEONModImmInstruction(MCInst &Inst,unsigned Val, +static DecodeStatus DecodeVMOVModImmInstruction(MCInst &Inst,unsigned Val, uint64_t Address, const void *Decoder); static DecodeStatus DecodeMVEModImmInstruction(MCInst &Inst,unsigned Val, uint64_t Address, const void *Decoder); @@ -561,6 +561,8 @@ static DecodeStatus DecodeMVEVCMP(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); @@ -3445,7 +3447,7 @@ static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus -DecodeNEONModImmInstruction(MCInst &Inst, unsigned Insn, 
+DecodeVMOVModImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -5679,7 +5681,7 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, } } } - return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder); + return DecodeVMOVModImmInstruction(Inst, Insn, Address, Decoder); } if (!(imm & 0x20)) return MCDisassembler::Fail; @@ -5738,7 +5740,7 @@ static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, } } } - return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder); + return DecodeVMOVModImmInstruction(Inst, Insn, Address, Decoder); } if (!(imm & 0x20)) return MCDisassembler::Fail; @@ -6481,6 +6483,12 @@ static DecodeStatus DecodeMVEOverlappingLongShift( if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder))) return MCDisassembler::Fail; + if (fieldFromInstruction (Insn, 6, 3) != 4) + return MCDisassembler::SoftFail; + + if (Rda == Rm) + return MCDisassembler::SoftFail; + return S; } @@ -6503,6 +6511,13 @@ static DecodeStatus DecodeMVEOverlappingLongShift( if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder))) return MCDisassembler::Fail; + if (Inst.getOpcode() == ARM::MVE_SQRSHRL || + Inst.getOpcode() == ARM::MVE_UQRSHLL) { + unsigned Saturate = fieldFromInstruction(Insn, 7, 1); + // Saturate, the bit position for saturation + Inst.addOperand(MCOperand::createImm(Saturate)); + } + return S; } @@ -6572,3 +6587,11 @@ static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, uint64_t Address, return MCDisassembler::Fail; return S; } + +static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + Inst.addOperand(MCOperand::createReg(ARM::VPR)); + Inst.addOperand(MCOperand::createReg(ARM::VPR)); + return S; +} diff --git a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h index 7732a6485a85..24a9fabf0979 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h +++ b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h @@ -518,10 +518,10 @@ namespace ARM_AM { // Valid alignments depend on the specific instruction. //===--------------------------------------------------------------------===// - // NEON Modified Immediates + // NEON/MVE Modified Immediates //===--------------------------------------------------------------------===// // - // Several NEON instructions (e.g., VMOV) take a "modified immediate" + // Several NEON and MVE instructions (e.g., VMOV) take a "modified immediate" // vector operand, where a small immediate encoded in the instruction // specifies a full NEON vector value. These modified immediates are // represented here as encoded integers. The low 8 bits hold the immediate @@ -529,20 +529,20 @@ namespace ARM_AM { // the "Cmode" field of the instruction. The interfaces below treat the // Op and Cmode values as a single 5-bit value. 
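It may help to restate the layout the comment above describes: a modified immediate is a 13-bit value whose low 8 bits are the payload and whose next 5 bits are the combined Op/Cmode selector that the decoder dispatches on. A minimal round-trip check in plain C++, mirroring the renamed helpers that follow:

#include <cassert>

// Pack/unpack a VMOV modified immediate: low 8 bits hold the payload,
// the next five bits hold the combined Op/Cmode selector.
static unsigned createVMOVModImm(unsigned OpCmode, unsigned Val) {
  return (OpCmode << 8) | Val;
}
static unsigned getVMOVModImmOpCmode(unsigned ModImm) {
  return (ModImm >> 8) & 0x1f;
}
static unsigned getVMOVModImmVal(unsigned ModImm) { return ModImm & 0xff; }

int main() {
  unsigned ModImm = createVMOVModImm(0xe, 0xab);
  assert(getVMOVModImmOpCmode(ModImm) == 0xe);
  assert(getVMOVModImmVal(ModImm) == 0xab);
}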
- inline unsigned createNEONModImm(unsigned OpCmode, unsigned Val) { + inline unsigned createVMOVModImm(unsigned OpCmode, unsigned Val) { return (OpCmode << 8) | Val; } - inline unsigned getNEONModImmOpCmode(unsigned ModImm) { + inline unsigned getVMOVModImmOpCmode(unsigned ModImm) { return (ModImm >> 8) & 0x1f; } - inline unsigned getNEONModImmVal(unsigned ModImm) { return ModImm & 0xff; } + inline unsigned getVMOVModImmVal(unsigned ModImm) { return ModImm & 0xff; } - /// decodeNEONModImm - Decode a NEON modified immediate value into the + /// decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the /// element value and the element size in bits. (If the element size is /// smaller than the vector, it is splatted into all the elements.) - inline uint64_t decodeNEONModImm(unsigned ModImm, unsigned &EltBits) { - unsigned OpCmode = getNEONModImmOpCmode(ModImm); - unsigned Imm8 = getNEONModImmVal(ModImm); + inline uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits) { + unsigned OpCmode = getVMOVModImmOpCmode(ModImm); + unsigned Imm8 = getVMOVModImmVal(ModImm); uint64_t Val = 0; if (OpCmode == 0xe) { @@ -572,7 +572,7 @@ namespace ARM_AM { } EltBits = 64; } else { - llvm_unreachable("Unsupported NEON immediate"); + llvm_unreachable("Unsupported VMOV immediate"); } return Val; } diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index aeab5be78ab4..6196881a9b8f 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -233,7 +233,7 @@ static const char *checkPCRelOffset(uint64_t Value, int64_t Min, int64_t Max) { const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup, uint64_t Value) const { - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { case ARM::fixup_arm_thumb_br: { // Relaxing tB to t2B. tB has a signed 12-bit displacement with the // low bit being an implied zero. There's an implied +4 offset for the @@ -870,7 +870,7 @@ bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCValue &Target) { const MCSymbolRefExpr *A = Target.getSymA(); const MCSymbol *Sym = A ? &A->getSymbol() : nullptr; - const unsigned FixupKind = Fixup.getKind() ; + const unsigned FixupKind = Fixup.getKind(); if (FixupKind == FK_NONE) return true; if (FixupKind == ARM::fixup_arm_thumb_bl) { @@ -1105,28 +1105,28 @@ uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding( if (Instrs.empty()) return 0; // Start off assuming CFA is at SP+0. - int CFARegister = ARM::SP; + unsigned CFARegister = ARM::SP; int CFARegisterOffset = 0; // Mark savable registers as initially unsaved DenseMap<unsigned, int> RegOffsets; int FloatRegCount = 0; // Process each .cfi directive and build up compact unwind info. 
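The compact-unwind logic that follows folds the stream of .cfi directives into a CFA register/offset pair plus per-register save offsets. A toy model of that state machine, with illustrative types rather than the MC API:

#include <cstddef>
#include <map>

// Fold CFI directives into unwind state, mirroring the switch below.
enum class CFIOp { DefCfa, DefCfaOffset, DefCfaRegister, Offset };
struct CFI { CFIOp Op; unsigned Reg; int Offset; };

struct UnwindState {
  unsigned CFARegister = 13;          // start off assuming CFA is at SP+0
  int CFARegisterOffset = 0;
  std::map<unsigned, int> RegOffsets; // saved registers, keyed by reg number
};

static UnwindState process(const CFI *I, std::size_t N) {
  UnwindState S;
  for (std::size_t i = 0; i != N; ++i) {
    switch (I[i].Op) {
    case CFIOp::DefCfa:         // DW_CFA_def_cfa: new register and offset
      S.CFARegisterOffset = -I[i].Offset;
      S.CFARegister = I[i].Reg;
      break;
    case CFIOp::DefCfaOffset:   // DW_CFA_def_cfa_offset
      S.CFARegisterOffset = -I[i].Offset;
      break;
    case CFIOp::DefCfaRegister: // DW_CFA_def_cfa_register
      S.CFARegister = I[i].Reg;
      break;
    case CFIOp::Offset:         // DW_CFA_offset: register saved at CFA+Offset
      S.RegOffsets[I[i].Reg] = I[i].Offset;
      break;
    }
  }
  return S;
}

int main() {
  const CFI Prologue[] = {{CFIOp::DefCfaOffset, 0, -8}, // sp adjusted by 8
                          {CFIOp::Offset, 14, -4},      // lr at CFA-4
                          {CFIOp::Offset, 7, -8}};      // r7 at CFA-8
  UnwindState S = process(Prologue, 3);
  return S.CFARegisterOffset == 8 && S.RegOffsets.at(14) == -4 ? 0 : 1;
}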
for (size_t i = 0, e = Instrs.size(); i != e; ++i) { - int Reg; + unsigned Reg; const MCCFIInstruction &Inst = Instrs[i]; switch (Inst.getOperation()) { case MCCFIInstruction::OpDefCfa: // DW_CFA_def_cfa CFARegisterOffset = -Inst.getOffset(); - CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true); + CFARegister = *MRI.getLLVMRegNum(Inst.getRegister(), true); break; case MCCFIInstruction::OpDefCfaOffset: // DW_CFA_def_cfa_offset CFARegisterOffset = -Inst.getOffset(); break; case MCCFIInstruction::OpDefCfaRegister: // DW_CFA_def_cfa_register - CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true); + CFARegister = *MRI.getLLVMRegNum(Inst.getRegister(), true); break; case MCCFIInstruction::OpOffset: // DW_CFA_offset - Reg = MRI.getLLVMRegNum(Inst.getRegister(), true); + Reg = *MRI.getLLVMRegNum(Inst.getRegister(), true); if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg)) RegOffsets[Reg] = Inst.getOffset(); else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) { diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h index c4daafe8ee97..6293a2462306 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h +++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h @@ -393,6 +393,9 @@ namespace ARMII { // in an IT block). ThumbArithFlagSetting = 1 << 19, + // Whether an instruction can be included in an MVE tail-predicated loop. + ValidForTailPredication = 1 << 20, + //===------------------------------------------------------------------===// // Code domain. DomainShift = 15, diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index fda19eea1de6..1fee38821a49 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -82,7 +82,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); if (IsPCRel) { - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol"); return ELF::R_ARM_NONE; @@ -145,7 +145,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, return ELF::R_ARM_THM_BF18; } } - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol"); return ELF::R_ARM_NONE; @@ -263,5 +263,5 @@ void ARMELFObjectWriter::addTargetSectionFlags(MCContext &Ctx, std::unique_ptr<MCObjectTargetWriter> llvm::createARMELFObjectWriter(uint8_t OSABI) { - return llvm::make_unique<ARMELFObjectWriter>(OSABI); + return std::make_unique<ARMELFObjectWriter>(OSABI); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp index 45be1ee96342..a1def61b58d9 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp @@ -1334,12 +1334,12 @@ void ARMInstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, << markup(">"); } -void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum, +void ARMInstPrinter::printVMOVModImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned EncodedImm = MI->getOperand(OpNum).getImm(); unsigned EltBits; - uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits); + uint64_t Val = ARM_AM::decodeVMOVModImm(EncodedImm, EltBits); O << markup("<imm:") 
<< "#0x"; O.write_hex(Val); O << markup(">"); @@ -1676,3 +1676,11 @@ void ARMInstPrinter::printExpandedImmOperand(const MCInst *MI, unsigned OpNum, O.write_hex(Val); O << markup(">"); } + +void ARMInstPrinter::printMveSaturateOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint32_t Val = MI->getOperand(OpNum).getImm(); + assert(Val <= 1 && "Invalid MVE saturate operand"); + O << "#" << (Val == 1 ? 48 : 64); +} diff --git a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h index 69026956b60e..eeb811e216fc 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h +++ b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h @@ -191,7 +191,7 @@ public: const MCSubtargetInfo &STI, raw_ostream &O); void printFPImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); - void printNEONModImmOperand(const MCInst *MI, unsigned OpNum, + void printVMOVModImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); void printImmPlusOneOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); @@ -262,7 +262,8 @@ public: const MCSubtargetInfo &STI, raw_ostream &O); void printExpandedImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); - + void printMveSaturateOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); private: unsigned DefaultAltIdx = ARM::NoRegAltName; }; diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index dca6fe37d49a..268fe7efd9ce 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -1720,7 +1720,6 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op, unsigned Reg = MI.getOperand(Op).getReg(); bool SPRRegs = ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg); bool DPRRegs = ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg); - bool CLRMRegs = MI.getOpcode() == ARM::t2CLRM; unsigned Binary = 0; @@ -1739,21 +1738,13 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op, Binary |= NumRegs * 2; } else { const MCRegisterInfo &MRI = *CTX.getRegisterInfo(); - if (!CLRMRegs) { - assert(std::is_sorted(MI.begin() + Op, MI.end(), - [&](const MCOperand &LHS, const MCOperand &RHS) { - return MRI.getEncodingValue(LHS.getReg()) < - MRI.getEncodingValue(RHS.getReg()); - })); - } - + assert(std::is_sorted(MI.begin() + Op, MI.end(), + [&](const MCOperand &LHS, const MCOperand &RHS) { + return MRI.getEncodingValue(LHS.getReg()) < + MRI.getEncodingValue(RHS.getReg()); + })); for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) { - unsigned RegNo; - if (CLRMRegs && MI.getOperand(I).getReg() == ARM::APSR) { - RegNo = 15; - } else { - RegNo = MRI.getEncodingValue(MI.getOperand(I).getReg()); - } + unsigned RegNo = MRI.getEncodingValue(MI.getOperand(I).getReg()); Binary |= 1 << RegNo; } } diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index c49885023cb2..ed4000c7e5be 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -204,7 +204,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, // relocation entry in the low 16 bits of r_address field. 
unsigned ThumbBit = 0; unsigned MovtBit = 0; - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: break; case ARM::fixup_arm_movt_hi16: MovtBit = 1; @@ -480,7 +480,7 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer, // PAIR. I.e. it's correct that we insert the high bits of the addend in the // MOVW case here. relocation entries. uint32_t Value = 0; - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: break; case ARM::fixup_arm_movw_lo16: case ARM::fixup_t2_movw_lo16: @@ -506,5 +506,5 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer, std::unique_ptr<MCObjectTargetWriter> llvm::createARMMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype) { - return llvm::make_unique<ARMMachObjectWriter>(Is64Bit, CPUType, CPUSubtype); + return std::make_unique<ARMMachObjectWriter>(Is64Bit, CPUType, CPUSubtype); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp index b863517c0cca..7b30a61e8ccb 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -249,12 +249,12 @@ void ARMTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) { : ARM::FK_VFPV3_D16) : (STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_VFPV3XD_FP16 : ARM::FK_VFPV3XD))); - else if (STI.hasFeature(ARM::FeatureVFP2_D16_SP)) + else if (STI.hasFeature(ARM::FeatureVFP2_SP)) emitFPU(ARM::FK_VFPV2); } // ABI_HardFP_use attribute to indicate single precision FP. - if (STI.hasFeature(ARM::FeatureVFP2_D16_SP) && !STI.hasFeature(ARM::FeatureFP64)) + if (STI.hasFeature(ARM::FeatureVFP2_SP) && !STI.hasFeature(ARM::FeatureFP64)) emitAttribute(ARMBuildAttrs::ABI_HardFP_use, ARMBuildAttrs::HardFPSinglePrecision); diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp index 054a95dd1e12..900c5fe30364 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp @@ -92,7 +92,7 @@ namespace llvm { std::unique_ptr<MCObjectTargetWriter> createARMWinCOFFObjectWriter(bool Is64Bit) { - return llvm::make_unique<ARMWinCOFFObjectWriter>(Is64Bit); + return std::make_unique<ARMWinCOFFObjectWriter>(Is64Bit); } } // end namespace llvm diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp index 2e816bea5e91..b3c8146a9bde 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp @@ -22,20 +22,10 @@ public: std::unique_ptr<MCObjectWriter> OW) : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {} - void EmitAssemblerFlag(MCAssemblerFlag Flag) override; void EmitThumbFunc(MCSymbol *Symbol) override; void FinishImpl() override; }; -void ARMWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { - switch (Flag) { - default: llvm_unreachable("not implemented"); - case MCAF_SyntaxUnified: - case MCAF_Code16: - break; - } -} - void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) { getAssembler().setIsThumbFunc(Symbol); } diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp index 4b25986b90a7..cc31929899b4 100644 --- a/lib/Target/ARM/MLxExpansionPass.cpp +++ b/lib/Target/ARM/MLxExpansionPass.cpp @@ -86,8 +86,8 @@ void MLxExpansion::pushStack(MachineInstr *MI) { MachineInstr *MLxExpansion::getAccDefMI(MachineInstr 
*MI) const { // Look past COPY and INSERT_SUBREG instructions to find the // real definition MI. This is important for _sfp instructions. - unsigned Reg = MI->getOperand(1).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + Register Reg = MI->getOperand(1).getReg(); + if (Register::isPhysicalRegister(Reg)) return nullptr; MachineBasicBlock *MBB = MI->getParent(); @@ -97,13 +97,13 @@ MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const { break; if (DefMI->isCopyLike()) { Reg = DefMI->getOperand(1).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { DefMI = MRI->getVRegDef(Reg); continue; } } else if (DefMI->isInsertSubreg()) { Reg = DefMI->getOperand(2).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { DefMI = MRI->getVRegDef(Reg); continue; } @@ -114,9 +114,8 @@ MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const { } unsigned MLxExpansion::getDefReg(MachineInstr *MI) const { - unsigned Reg = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg) || - !MRI->hasOneNonDBGUse(Reg)) + Register Reg = MI->getOperand(0).getReg(); + if (Register::isPhysicalRegister(Reg) || !MRI->hasOneNonDBGUse(Reg)) return Reg; MachineBasicBlock *MBB = MI->getParent(); @@ -126,8 +125,7 @@ unsigned MLxExpansion::getDefReg(MachineInstr *MI) const { while (UseMI->isCopy() || UseMI->isInsertSubreg()) { Reg = UseMI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg) || - !MRI->hasOneNonDBGUse(Reg)) + if (Register::isPhysicalRegister(Reg) || !MRI->hasOneNonDBGUse(Reg)) return Reg; UseMI = &*MRI->use_instr_nodbg_begin(Reg); if (UseMI->getParent() != MBB) @@ -140,8 +138,8 @@ unsigned MLxExpansion::getDefReg(MachineInstr *MI) const { /// hasLoopHazard - Check whether an MLx instruction is chained to itself across /// a single-MBB loop. 
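The switch from plain unsigned to Register throughout this file is mechanical, but the class exists to make the physical/virtual distinction checkable without consulting TargetRegisterInfo. A loose toy model, assuming LLVM's convention that virtual register numbers carry the top bit (that convention is an assumption here, not something shown in this diff):

#include <cassert>
#include <cstdint>

// Toy stand-in for llvm::Register; not the LLVM API.
struct Reg {
  uint32_t Id = 0;
  static constexpr uint32_t VirtualBit = 1u << 31;
  static bool isVirtual(Reg R) { return (R.Id & VirtualBit) != 0; }
  static bool isPhysical(Reg R) { return R.Id != 0 && !isVirtual(R); }
};

int main() {
  Reg R0{1};                   // a physical register
  Reg V0{Reg::VirtualBit | 0}; // the first virtual register
  assert(Reg::isPhysical(R0) && !Reg::isVirtual(R0));
  assert(Reg::isVirtual(V0) && !Reg::isPhysical(V0));
}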
bool MLxExpansion::hasLoopHazard(MachineInstr *MI) const { - unsigned Reg = MI->getOperand(1).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + Register Reg = MI->getOperand(1).getReg(); + if (Register::isPhysicalRegister(Reg)) return false; MachineBasicBlock *MBB = MI->getParent(); @@ -154,8 +152,8 @@ outer_continue: if (DefMI->isPHI()) { for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) { if (DefMI->getOperand(i + 1).getMBB() == MBB) { - unsigned SrcReg = DefMI->getOperand(i).getReg(); - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) { + Register SrcReg = DefMI->getOperand(i).getReg(); + if (Register::isVirtualRegister(SrcReg)) { DefMI = MRI->getVRegDef(SrcReg); goto outer_continue; } @@ -163,13 +161,13 @@ outer_continue: } } else if (DefMI->isCopyLike()) { Reg = DefMI->getOperand(1).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { DefMI = MRI->getVRegDef(Reg); continue; } } else if (DefMI->isInsertSubreg()) { Reg = DefMI->getOperand(2).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { DefMI = MRI->getVRegDef(Reg); continue; } @@ -271,23 +269,23 @@ void MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, unsigned MulOpc, unsigned AddSubOpc, bool NegAcc, bool HasLane) { - unsigned DstReg = MI->getOperand(0).getReg(); + Register DstReg = MI->getOperand(0).getReg(); bool DstDead = MI->getOperand(0).isDead(); - unsigned AccReg = MI->getOperand(1).getReg(); - unsigned Src1Reg = MI->getOperand(2).getReg(); - unsigned Src2Reg = MI->getOperand(3).getReg(); + Register AccReg = MI->getOperand(1).getReg(); + Register Src1Reg = MI->getOperand(2).getReg(); + Register Src2Reg = MI->getOperand(3).getReg(); bool Src1Kill = MI->getOperand(2).isKill(); bool Src2Kill = MI->getOperand(3).isKill(); unsigned LaneImm = HasLane ? MI->getOperand(4).getImm() : 0; unsigned NextOp = HasLane ? 5 : 4; ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NextOp).getImm(); - unsigned PredReg = MI->getOperand(++NextOp).getReg(); + Register PredReg = MI->getOperand(++NextOp).getReg(); const MCInstrDesc &MCID1 = TII->get(MulOpc); const MCInstrDesc &MCID2 = TII->get(AddSubOpc); const MachineFunction &MF = *MI->getParent()->getParent(); - unsigned TmpReg = MRI->createVirtualRegister( - TII->getRegClass(MCID1, 0, TRI, MF)); + Register TmpReg = + MRI->createVirtualRegister(TII->getRegClass(MCID1, 0, TRI, MF)); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg) .addReg(Src1Reg, getKillRegState(Src1Kill)) diff --git a/lib/Target/ARM/MVETailPredication.cpp b/lib/Target/ARM/MVETailPredication.cpp new file mode 100644 index 000000000000..4db8ab17c49b --- /dev/null +++ b/lib/Target/ARM/MVETailPredication.cpp @@ -0,0 +1,519 @@ +//===- MVETailPredication.cpp - MVE Tail Predication ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Armv8.1m introduced MVE, M-Profile Vector Extension, and low-overhead +/// branches to help accelerate DSP applications. These two extensions can be +/// combined to provide implicit vector predication within a low-overhead loop. 
+/// The HardwareLoops pass inserts intrinsics identifying loops that the
+/// backend will attempt to convert into a low-overhead loop. The vectorizer is
+/// responsible for generating a vectorized loop in which the lanes are
+/// predicated upon the iteration counter. This pass looks at these predicated
+/// vector loops, which are targets for low-overhead loops, and prepares them
+/// for code generation. Once the vectorizer has produced a masked loop, there
+/// are a couple of final forms:
+/// - A tail-predicated loop, with implicit predication.
+/// - A loop containing multiple VCTP instructions, predicating multiple VPT
+///   blocks of instructions operating on different vector types.
+
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "ARM.h"
+#include "ARMSubtarget.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mve-tail-predication"
+#define DESC "Transform predicated vector loops to use MVE tail predication"
+
+static cl::opt<bool>
+DisableTailPredication("disable-mve-tail-predication", cl::Hidden,
+                       cl::init(true),
+                       cl::desc("Disable MVE Tail Predication"));
+namespace {
+
+class MVETailPredication : public LoopPass {
+  SmallVector<IntrinsicInst*, 4> MaskedInsts;
+  Loop *L = nullptr;
+  ScalarEvolution *SE = nullptr;
+  TargetTransformInfo *TTI = nullptr;
+
+public:
+  static char ID;
+
+  MVETailPredication() : LoopPass(ID) { }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequired<TargetPassConfig>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addPreserved<LoopInfoWrapperPass>();
+    AU.setPreservesCFG();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager&) override;
+
+private:
+
+  /// Perform the relevant checks on the loop and convert if possible.
+  bool TryConvert(Value *TripCount);
+
+  /// Return whether this is a vectorized loop that contains masked
+  /// load/stores.
+  bool IsPredicatedVectorLoop();
+
+  /// Compute a value for the total number of elements that the predicated
+  /// loop will process.
+  Value *ComputeElements(Value *TripCount, VectorType *VecTy);
+
+  /// Return whether Predicate is the icmp that generates an i1 vector, based
+  /// upon a loop counter and a limit that is defined outside the loop.
+  bool isTailPredicate(Instruction *Predicate, Value *NumElements);
+};
+
+} // end namespace
+
+static bool IsDecrement(Instruction &I) {
+  auto *Call = dyn_cast<IntrinsicInst>(&I);
+  if (!Call)
+    return false;
+
+  Intrinsic::ID ID = Call->getIntrinsicID();
+  return ID == Intrinsic::loop_decrement_reg;
+}
+
+static bool IsMasked(Instruction *I) {
+  auto *Call = dyn_cast<IntrinsicInst>(I);
+  if (!Call)
+    return false;
+
+  Intrinsic::ID ID = Call->getIntrinsicID();
+  // TODO: Support gather/scatter expand/compress operations.
+  return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load;
+}
+
+bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
+  if (skipLoop(L) || DisableTailPredication)
+    return false;
+
+  Function &F = *L->getHeader()->getParent();
+  auto &TPC = getAnalysis<TargetPassConfig>();
+  auto &TM = TPC.getTM<TargetMachine>();
+  auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  this->L = L;
+
+  // The MVE and LOB extensions are combined to enable tail-predication, but
+  // there's nothing preventing us from generating VCTP instructions for v8.1m.
+  if (!ST->hasMVEIntegerOps() || !ST->hasV8_1MMainlineOps()) {
+    LLVM_DEBUG(dbgs() << "TP: Not a v8.1m.main+mve target.\n");
+    return false;
+  }
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+  if (!Preheader)
+    return false;
+
+  auto FindLoopIterations = [](BasicBlock *BB) -> IntrinsicInst* {
+    for (auto &I : *BB) {
+      auto *Call = dyn_cast<IntrinsicInst>(&I);
+      if (!Call)
+        continue;
+
+      Intrinsic::ID ID = Call->getIntrinsicID();
+      if (ID == Intrinsic::set_loop_iterations ||
+          ID == Intrinsic::test_set_loop_iterations)
+        return cast<IntrinsicInst>(&I);
+    }
+    return nullptr;
+  };
+
+  // Look for the hardware loop intrinsic that sets the iteration count.
+  IntrinsicInst *Setup = FindLoopIterations(Preheader);
+
+  // The test.set intrinsic may instead live in the preheader's single
+  // predecessor.
+  if (!Setup) {
+    if (!Preheader->getSinglePredecessor())
+      return false;
+    Setup = FindLoopIterations(Preheader->getSinglePredecessor());
+    if (!Setup)
+      return false;
+  }
+
+  // Search for the hardware loop intrinsic that decrements the loop counter.
+  IntrinsicInst *Decrement = nullptr;
+  for (auto *BB : L->getBlocks()) {
+    for (auto &I : *BB) {
+      if (IsDecrement(I)) {
+        Decrement = cast<IntrinsicInst>(&I);
+        break;
+      }
+    }
+  }
+
+  if (!Decrement)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "TP: Running on Loop: " << *L
+                    << *Setup << "\n"
+                    << *Decrement << "\n");
+  bool Changed = TryConvert(Setup->getArgOperand(0));
+  return Changed;
+}
+
+bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
+  // Look for the following:
+
+  // %trip.count.minus.1 = add i32 %N, -1
+  // %broadcast.splatinsert10 = insertelement <4 x i32> undef,
+  //                                          i32 %trip.count.minus.1, i32 0
+  // %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10,
+  //                                    <4 x i32> undef,
+  //                                    <4 x i32> zeroinitializer
+  // ...
+  // ...
+  // %index = phi i32
+  // %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  // %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
+  //                                  <4 x i32> undef,
+  //                                  <4 x i32> zeroinitializer
+  // %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  // %pred = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+  // And return whether V == %pred.
+
+  using namespace PatternMatch;
+
+  CmpInst::Predicate Pred;
+  Instruction *Shuffle = nullptr;
+  Instruction *Induction = nullptr;
+
+  // The vector icmp.
+  if (!match(I, m_ICmp(Pred, m_Instruction(Induction),
+                       m_Instruction(Shuffle))) ||
+      Pred != ICmpInst::ICMP_ULE || !L->isLoopInvariant(Shuffle))
+    return false;
+
+  // First, find the code outside the loop that sets up the limit vector.
+  // The invariant shuffle that broadcasts the limit into a vector.
+ Instruction *Insert = nullptr; + if (!match(Shuffle, m_ShuffleVector(m_Instruction(Insert), m_Undef(), + m_Zero()))) + return false; + + // Insert the limit into a vector. + Instruction *BECount = nullptr; + if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(BECount), + m_Zero()))) + return false; + + // The limit calculation, backedge count. + Value *TripCount = nullptr; + if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes()))) + return false; + + if (TripCount != NumElements) + return false; + + // Now back to searching inside the loop body... + // Find the add with takes the index iv and adds a constant vector to it. + Instruction *BroadcastSplat = nullptr; + Constant *Const = nullptr; + if (!match(Induction, m_Add(m_Instruction(BroadcastSplat), + m_Constant(Const)))) + return false; + + // Check that we're adding <0, 1, 2, 3... + if (auto *CDS = dyn_cast<ConstantDataSequential>(Const)) { + for (unsigned i = 0; i < CDS->getNumElements(); ++i) { + if (CDS->getElementAsInteger(i) != i) + return false; + } + } else + return false; + + // The shuffle which broadcasts the index iv into a vector. + if (!match(BroadcastSplat, m_ShuffleVector(m_Instruction(Insert), m_Undef(), + m_Zero()))) + return false; + + // The insert element which initialises a vector with the index iv. + Instruction *IV = nullptr; + if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(IV), m_Zero()))) + return false; + + // The index iv. + auto *Phi = dyn_cast<PHINode>(IV); + if (!Phi) + return false; + + // TODO: Don't think we need to check the entry value. + Value *OnEntry = Phi->getIncomingValueForBlock(L->getLoopPreheader()); + if (!match(OnEntry, m_Zero())) + return false; + + Value *InLoop = Phi->getIncomingValueForBlock(L->getLoopLatch()); + unsigned Lanes = cast<VectorType>(Insert->getType())->getNumElements(); + + Instruction *LHS = nullptr; + if (!match(InLoop, m_Add(m_Instruction(LHS), m_SpecificInt(Lanes)))) + return false; + + return LHS == Phi; +} + +static VectorType* getVectorType(IntrinsicInst *I) { + unsigned TypeOp = I->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1; + auto *PtrTy = cast<PointerType>(I->getOperand(TypeOp)->getType()); + return cast<VectorType>(PtrTy->getElementType()); +} + +bool MVETailPredication::IsPredicatedVectorLoop() { + // Check that the loop contains at least one masked load/store intrinsic. + // We only support 'normal' vector instructions - other than masked + // load/stores. + for (auto *BB : L->getBlocks()) { + for (auto &I : *BB) { + if (IsMasked(&I)) { + VectorType *VecTy = getVectorType(cast<IntrinsicInst>(&I)); + unsigned Lanes = VecTy->getNumElements(); + unsigned ElementWidth = VecTy->getScalarSizeInBits(); + // MVE vectors are 128-bit, but don't support 128 x i1. + // TODO: Can we support vectors larger than 128-bits? + unsigned MaxWidth = TTI->getRegisterBitWidth(true); + if (Lanes * ElementWidth != MaxWidth || Lanes == MaxWidth) + return false; + MaskedInsts.push_back(cast<IntrinsicInst>(&I)); + } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) { + for (auto &U : Int->args()) { + if (isa<VectorType>(U->getType())) + return false; + } + } + } + } + + return !MaskedInsts.empty(); +} + +Value* MVETailPredication::ComputeElements(Value *TripCount, + VectorType *VecTy) { + const SCEV *TripCountSE = SE->getSCEV(TripCount); + ConstantInt *VF = ConstantInt::get(cast<IntegerType>(TripCount->getType()), + VecTy->getNumElements()); + + if (VF->equalsInt(1)) + return nullptr; + + // TODO: Support constant trip counts. 
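isTailPredicate, completed above, accepts only this canonical vectorizer shape, in which lane L of the predicate is live exactly when index + L <= trip.count - 1. A scalar model of the mask for a 4-lane loop, for illustration only:

#include <array>
#include <cstdio>

// Compute the <4 x i1> mask the matched IR produces on one iteration:
// induction = splat(index) + <0,1,2,3>;  pred = induction ule (N - 1).
static std::array<bool, 4> laneMask(unsigned Index, unsigned N) {
  std::array<bool, 4> Pred{};
  for (unsigned L = 0; L < 4; ++L)
    Pred[L] = (Index + L) <= (N - 1); // 'ule' against the splat limit
  return Pred;
}

int main() {
  // N = 10 elements: the final iteration (Index = 8) enables lanes 0 and 1.
  auto M = laneMask(8, 10);
  std::printf("%d %d %d %d\n", M[0], M[1], M[2], M[3]); // 1 1 0 0
}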
+ auto VisitAdd = [&](const SCEVAddExpr *S) -> const SCEVMulExpr* { + if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) { + if (Const->getAPInt() != -VF->getValue()) + return nullptr; + } else + return nullptr; + return dyn_cast<SCEVMulExpr>(S->getOperand(1)); + }; + + auto VisitMul = [&](const SCEVMulExpr *S) -> const SCEVUDivExpr* { + if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) { + if (Const->getValue() != VF) + return nullptr; + } else + return nullptr; + return dyn_cast<SCEVUDivExpr>(S->getOperand(1)); + }; + + auto VisitDiv = [&](const SCEVUDivExpr *S) -> const SCEV* { + if (auto *Const = dyn_cast<SCEVConstant>(S->getRHS())) { + if (Const->getValue() != VF) + return nullptr; + } else + return nullptr; + + if (auto *RoundUp = dyn_cast<SCEVAddExpr>(S->getLHS())) { + if (auto *Const = dyn_cast<SCEVConstant>(RoundUp->getOperand(0))) { + if (Const->getAPInt() != (VF->getValue() - 1)) + return nullptr; + } else + return nullptr; + + return RoundUp->getOperand(1); + } + return nullptr; + }; + + // TODO: Can we use SCEV helpers, such as findArrayDimensions, and friends to + // determine the numbers of elements instead? Looks like this is what is used + // for delinearization, but I'm not sure if it can be applied to the + // vectorized form - at least not without a bit more work than I feel + // comfortable with. + + // Search for Elems in the following SCEV: + // (1 + ((-VF + (VF * (((VF - 1) + %Elems) /u VF))<nuw>) /u VF))<nuw><nsw> + const SCEV *Elems = nullptr; + if (auto *TC = dyn_cast<SCEVAddExpr>(TripCountSE)) + if (auto *Div = dyn_cast<SCEVUDivExpr>(TC->getOperand(1))) + if (auto *Add = dyn_cast<SCEVAddExpr>(Div->getLHS())) + if (auto *Mul = VisitAdd(Add)) + if (auto *Div = VisitMul(Mul)) + if (auto *Res = VisitDiv(Div)) + Elems = Res; + + if (!Elems) + return nullptr; + + Instruction *InsertPt = L->getLoopPreheader()->getTerminator(); + if (!isSafeToExpandAt(Elems, InsertPt, *SE)) + return nullptr; + + auto DL = L->getHeader()->getModule()->getDataLayout(); + SCEVExpander Expander(*SE, DL, "elements"); + return Expander.expandCodeFor(Elems, Elems->getType(), InsertPt); +} + +// Look through the exit block to see whether there's a duplicate predicate +// instruction. This can happen when we need to perform a select on values +// from the last and previous iteration. Instead of doing a straight +// replacement of that predicate with the vctp, clone the vctp and place it +// in the block. This means that the VPR doesn't have to be live into the +// exit block which should make it easier to convert this loop into a proper +// tail predicated loop. +static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates, + SetVector<Instruction*> &MaybeDead, Loop *L) { + if (BasicBlock *Exit = L->getUniqueExitBlock()) { + for (auto &Pair : NewPredicates) { + Instruction *OldPred = Pair.first; + Instruction *NewPred = Pair.second; + + for (auto &I : *Exit) { + if (I.isSameOperationAs(OldPred)) { + Instruction *PredClone = NewPred->clone(); + PredClone->insertBefore(&I); + I.replaceAllUsesWith(PredClone); + MaybeDead.insert(&I); + break; + } + } + } + } + + // Drop references and add operands to check for dead. 
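The SCEV that ComputeElements searches (quoted in the comment above) is a round-up division in disguise: writing D = ((VF - 1) + Elems) /u VF, the trip count is 1 + ((VF*D - VF) /u VF) = 1 + (D - 1) = ceil(Elems / VF), which is why walking back down the expression recovers Elems. A quick check of that identity:

#include <cassert>

// Trip count as the vectorizer's SCEV spells it, for unsigned inputs.
static unsigned tripCountSCEV(unsigned Elems, unsigned VF) {
  unsigned D = ((VF - 1) + Elems) / VF; // (((VF - 1) + %Elems) /u VF)
  return 1 + ((VF * D - VF) / VF);      // (1 + ((-VF + VF*D) /u VF))
}

static unsigned ceilDiv(unsigned Elems, unsigned VF) {
  return (Elems + VF - 1) / VF;
}

int main() {
  for (unsigned Elems = 1; Elems <= 64; ++Elems)
    assert(tripCountSCEV(Elems, 4) == ceilDiv(Elems, 4));
}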
+ SmallPtrSet<Instruction*, 4> Dead; + while (!MaybeDead.empty()) { + auto *I = MaybeDead.front(); + MaybeDead.remove(I); + if (I->hasNUsesOrMore(1)) + continue; + + for (auto &U : I->operands()) { + if (auto *OpI = dyn_cast<Instruction>(U)) + MaybeDead.insert(OpI); + } + I->dropAllReferences(); + Dead.insert(I); + } + + for (auto *I : Dead) + I->eraseFromParent(); + + for (auto I : L->blocks()) + DeleteDeadPHIs(I); +} + +bool MVETailPredication::TryConvert(Value *TripCount) { + if (!IsPredicatedVectorLoop()) + return false; + + LLVM_DEBUG(dbgs() << "TP: Found predicated vector loop.\n"); + + // Walk through the masked intrinsics and try to find whether the predicate + // operand is generated from an induction variable. + Module *M = L->getHeader()->getModule(); + Type *Ty = IntegerType::get(M->getContext(), 32); + SetVector<Instruction*> Predicates; + DenseMap<Instruction*, Instruction*> NewPredicates; + + for (auto *I : MaskedInsts) { + Intrinsic::ID ID = I->getIntrinsicID(); + unsigned PredOp = ID == Intrinsic::masked_load ? 2 : 3; + auto *Predicate = dyn_cast<Instruction>(I->getArgOperand(PredOp)); + if (!Predicate || Predicates.count(Predicate)) + continue; + + VectorType *VecTy = getVectorType(I); + Value *NumElements = ComputeElements(TripCount, VecTy); + if (!NumElements) + continue; + + if (!isTailPredicate(Predicate, NumElements)) { + LLVM_DEBUG(dbgs() << "TP: Not tail predicate: " << *Predicate << "\n"); + continue; + } + + LLVM_DEBUG(dbgs() << "TP: Found tail predicate: " << *Predicate << "\n"); + Predicates.insert(Predicate); + + // Insert a phi to count the number of elements processed by the loop. + IRBuilder<> Builder(L->getHeader()->getFirstNonPHI()); + PHINode *Processed = Builder.CreatePHI(Ty, 2); + Processed->addIncoming(NumElements, L->getLoopPreheader()); + + // Insert the intrinsic to represent the effect of tail predication. + Builder.SetInsertPoint(cast<Instruction>(Predicate)); + ConstantInt *Factor = + ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements()); + Intrinsic::ID VCTPID; + switch (VecTy->getNumElements()) { + default: + llvm_unreachable("unexpected number of lanes"); + case 2: VCTPID = Intrinsic::arm_vctp64; break; + case 4: VCTPID = Intrinsic::arm_vctp32; break; + case 8: VCTPID = Intrinsic::arm_vctp16; break; + case 16: VCTPID = Intrinsic::arm_vctp8; break; + } + Function *VCTP = Intrinsic::getDeclaration(M, VCTPID); + Value *TailPredicate = Builder.CreateCall(VCTP, Processed); + Predicate->replaceAllUsesWith(TailPredicate); + NewPredicates[Predicate] = cast<Instruction>(TailPredicate); + + // Add the incoming value to the new phi. + // TODO: This add likely already exists in the loop. + Value *Remaining = Builder.CreateSub(Processed, Factor); + Processed->addIncoming(Remaining, L->getLoopLatch()); + LLVM_DEBUG(dbgs() << "TP: Insert processed elements phi: " + << *Processed << "\n" + << "TP: Inserted VCTP: " << *TailPredicate << "\n"); + } + + // Now clean up. 
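The rewrite performed by TryConvert above leaves the loop counting elements rather than lanes: a phi starts at NumElements, the vctp enables the first min(remaining, VF) lanes, and the phi steps down by VF each iteration. A scalar model of the resulting pattern, with illustrative names:

#include <cstdio>

int main() {
  const int VF = 4;   // lanes per vector (e.g. vctp32)
  int Processed = 10; // NumElements, as computed in the preheader
  while (Processed > 0) {
    // vctp semantics: lane L is active iff L < elements remaining.
    for (int L = 0; L < VF; ++L)
      std::printf("%d", L < Processed ? 1 : 0);
    std::printf("\n"); // prints 1111, 1111, 1100 for 10 elements
    Processed -= VF;   // the new phi's loop-carried update
  }
}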
+ Cleanup(NewPredicates, Predicates, L); + return true; +} + +Pass *llvm::createMVETailPredicationPass() { + return new MVETailPredication(); +} + +char MVETailPredication::ID = 0; + +INITIALIZE_PASS_BEGIN(MVETailPredication, DEBUG_TYPE, DESC, false, false) +INITIALIZE_PASS_END(MVETailPredication, DEBUG_TYPE, DESC, false, false) diff --git a/lib/Target/ARM/MVEVPTBlockPass.cpp b/lib/Target/ARM/MVEVPTBlockPass.cpp new file mode 100644 index 000000000000..bc0a80b177ed --- /dev/null +++ b/lib/Target/ARM/MVEVPTBlockPass.cpp @@ -0,0 +1,278 @@ +//===-- MVEVPTBlockPass.cpp - Insert MVE VPT blocks -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMSubtarget.h" +#include "MCTargetDesc/ARMBaseInfo.h" +#include "Thumb2InstrInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineInstrBundle.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include <cassert> +#include <new> + +using namespace llvm; + +#define DEBUG_TYPE "arm-mve-vpt" + +namespace { + class MVEVPTBlock : public MachineFunctionPass { + public: + static char ID; + const Thumb2InstrInfo *TII; + const TargetRegisterInfo *TRI; + + MVEVPTBlock() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &Fn) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + + StringRef getPassName() const override { + return "MVE VPT block insertion pass"; + } + + private: + bool InsertVPTBlocks(MachineBasicBlock &MBB); + }; + + char MVEVPTBlock::ID = 0; + +} // end anonymous namespace + +INITIALIZE_PASS(MVEVPTBlock, DEBUG_TYPE, "ARM MVE VPT block pass", false, false) + +enum VPTMaskValue { + T = 8, // 0b1000 + TT = 4, // 0b0100 + TE = 12, // 0b1100 + TTT = 2, // 0b0010 + TTE = 6, // 0b0110 + TEE = 10, // 0b1010 + TET = 14, // 0b1110 + TTTT = 1, // 0b0001 + TTTE = 3, // 0b0011 + TTEE = 5, // 0b0101 + TTET = 7, // 0b0111 + TEEE = 9, // 0b1001 + TEET = 11, // 0b1011 + TETT = 13, // 0b1101 + TETE = 15 // 0b1111 +}; + +static unsigned VCMPOpcodeToVPT(unsigned Opcode) { + switch (Opcode) { + case ARM::MVE_VCMPf32: + return ARM::MVE_VPTv4f32; + case ARM::MVE_VCMPf16: + return ARM::MVE_VPTv8f16; + case ARM::MVE_VCMPi8: + return ARM::MVE_VPTv16i8; + case ARM::MVE_VCMPi16: + return ARM::MVE_VPTv8i16; + case ARM::MVE_VCMPi32: + return ARM::MVE_VPTv4i32; + case ARM::MVE_VCMPu8: + return ARM::MVE_VPTv16u8; + case ARM::MVE_VCMPu16: + return ARM::MVE_VPTv8u16; + case ARM::MVE_VCMPu32: + return ARM::MVE_VPTv4u32; + case ARM::MVE_VCMPs8: + return ARM::MVE_VPTv16s8; + case ARM::MVE_VCMPs16: + return ARM::MVE_VPTv8s16; + case ARM::MVE_VCMPs32: + return ARM::MVE_VPTv4s32; + + case ARM::MVE_VCMPf32r: + return ARM::MVE_VPTv4f32r; + case 
ARM::MVE_VCMPf16r:
+    return ARM::MVE_VPTv8f16r;
+  case ARM::MVE_VCMPi8r:
+    return ARM::MVE_VPTv16i8r;
+  case ARM::MVE_VCMPi16r:
+    return ARM::MVE_VPTv8i16r;
+  case ARM::MVE_VCMPi32r:
+    return ARM::MVE_VPTv4i32r;
+  case ARM::MVE_VCMPu8r:
+    return ARM::MVE_VPTv16u8r;
+  case ARM::MVE_VCMPu16r:
+    return ARM::MVE_VPTv8u16r;
+  case ARM::MVE_VCMPu32r:
+    return ARM::MVE_VPTv4u32r;
+  case ARM::MVE_VCMPs8r:
+    return ARM::MVE_VPTv16s8r;
+  case ARM::MVE_VCMPs16r:
+    return ARM::MVE_VPTv8s16r;
+  case ARM::MVE_VCMPs32r:
+    return ARM::MVE_VPTv4s32r;
+
+  default:
+    return 0;
+  }
+}
+
+static MachineInstr *findVCMPToFoldIntoVPST(MachineBasicBlock::iterator MI,
+                                            const TargetRegisterInfo *TRI,
+                                            unsigned &NewOpcode) {
+  // Search backwards to the instruction that defines VPR. This may or may
+  // not be a VCMP; we check that after this loop. If we find another
+  // instruction that reads VPR first, we return nullptr.
+  MachineBasicBlock::iterator CmpMI = MI;
+  while (CmpMI != MI->getParent()->begin()) {
+    --CmpMI;
+    if (CmpMI->modifiesRegister(ARM::VPR, TRI))
+      break;
+    if (CmpMI->readsRegister(ARM::VPR, TRI))
+      break;
+  }
+
+  if (CmpMI == MI)
+    return nullptr;
+  NewOpcode = VCMPOpcodeToVPT(CmpMI->getOpcode());
+  if (NewOpcode == 0)
+    return nullptr;
+
+  // Search forward from CmpMI to MI, checking if either register was def'd.
+  if (registerDefinedBetween(CmpMI->getOperand(1).getReg(), std::next(CmpMI),
+                             MI, TRI))
+    return nullptr;
+  if (registerDefinedBetween(CmpMI->getOperand(2).getReg(), std::next(CmpMI),
+                             MI, TRI))
+    return nullptr;
+  return &*CmpMI;
+}
+
+bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
+  bool Modified = false;
+  MachineBasicBlock::instr_iterator MBIter = Block.instr_begin();
+  MachineBasicBlock::instr_iterator EndIter = Block.instr_end();
+
+  while (MBIter != EndIter) {
+    MachineInstr *MI = &*MBIter;
+    unsigned PredReg = 0;
+    DebugLoc dl = MI->getDebugLoc();
+
+    ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*MI, PredReg);
+
+    // The idea of the predicate is that None, Then and Else are for use when
+    // handling assembly language: they correspond to the three possible
+    // suffixes "", "t" and "e" on the mnemonic. So when instructions are read
+    // from assembly source or disassembled from object code, you expect to see
+    // a mixture whenever there's a long VPT block. But in code generation, we
+    // hope we'll never generate an Else as input to this pass.
+    assert(Pred != ARMVCC::Else && "VPT block pass does not expect Else preds");
+
+    if (Pred == ARMVCC::None) {
+      ++MBIter;
+      continue;
+    }
+
+    LLVM_DEBUG(dbgs() << "VPT block created for: "; MI->dump());
+    int VPTInstCnt = 1;
+    ARMVCC::VPTCodes NextPred;
+
+    // Look at subsequent instructions, checking if they can be in the same VPT
+    // block.
+ ++MBIter; + while (MBIter != EndIter && VPTInstCnt < 4) { + NextPred = getVPTInstrPredicate(*MBIter, PredReg); + assert(NextPred != ARMVCC::Else && + "VPT block pass does not expect Else preds"); + if (NextPred != Pred) + break; + LLVM_DEBUG(dbgs() << " adding : "; MBIter->dump()); + ++VPTInstCnt; + ++MBIter; + }; + + unsigned BlockMask = 0; + switch (VPTInstCnt) { + case 1: + BlockMask = VPTMaskValue::T; + break; + case 2: + BlockMask = VPTMaskValue::TT; + break; + case 3: + BlockMask = VPTMaskValue::TTT; + break; + case 4: + BlockMask = VPTMaskValue::TTTT; + break; + default: + llvm_unreachable("Unexpected number of instruction in a VPT block"); + }; + + // Search back for a VCMP that can be folded to create a VPT, or else create + // a VPST directly + MachineInstrBuilder MIBuilder; + unsigned NewOpcode; + MachineInstr *VCMP = findVCMPToFoldIntoVPST(MI, TRI, NewOpcode); + if (VCMP) { + LLVM_DEBUG(dbgs() << " folding VCMP into VPST: "; VCMP->dump()); + MIBuilder = BuildMI(Block, MI, dl, TII->get(NewOpcode)); + MIBuilder.addImm(BlockMask); + MIBuilder.add(VCMP->getOperand(1)); + MIBuilder.add(VCMP->getOperand(2)); + MIBuilder.add(VCMP->getOperand(3)); + VCMP->eraseFromParent(); + } else { + MIBuilder = BuildMI(Block, MI, dl, TII->get(ARM::MVE_VPST)); + MIBuilder.addImm(BlockMask); + } + + finalizeBundle( + Block, MachineBasicBlock::instr_iterator(MIBuilder.getInstr()), MBIter); + + Modified = true; + } + return Modified; +} + +bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) { + const ARMSubtarget &STI = + static_cast<const ARMSubtarget &>(Fn.getSubtarget()); + + if (!STI.isThumb2() || !STI.hasMVEIntegerOps()) + return false; + + TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo()); + TRI = STI.getRegisterInfo(); + + LLVM_DEBUG(dbgs() << "********** ARM MVE VPT BLOCKS **********\n" + << "********** Function: " << Fn.getName() << '\n'); + + bool Modified = false; + for (MachineBasicBlock &MBB : Fn) + Modified |= InsertVPTBlocks(MBB); + + LLVM_DEBUG(dbgs() << "**************************************\n"); + return Modified; +} + +/// createMVEVPTBlock - Returns an instance of the MVE VPT block +/// insertion pass. +FunctionPass *llvm::createMVEVPTBlockPass() { return new MVEVPTBlock(); } diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index 426e9a0ed9b8..956d474f1d79 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -164,7 +164,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, // to determine the end of the prologue. 
DebugLoc dl; - unsigned FramePtr = RegInfo->getFrameRegister(MF); + Register FramePtr = RegInfo->getFrameRegister(MF); unsigned BasePtr = RegInfo->getBaseRegister(); int CFAOffset = 0; @@ -459,8 +459,8 @@ static bool isCSRestore(MachineInstr &MI, const MCPhysReg *CSRegs) { else if (MI.getOpcode() == ARM::tPOP) { return true; } else if (MI.getOpcode() == ARM::tMOVr) { - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); return ((ARM::tGPRRegClass.contains(Src) || Src == ARM::LR) && ARM::hGPRRegClass.contains(Dst)); } @@ -483,7 +483,7 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, assert((unsigned)NumBytes >= ArgRegsSaveSize && "ArgRegsSaveSize is included in NumBytes"); const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); - unsigned FramePtr = RegInfo->getFrameRegister(MF); + Register FramePtr = RegInfo->getFrameRegister(MF); if (!AFI->hasStackFrame()) { if (NumBytes - ArgRegsSaveSize != 0) diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index f57d93a2e83d..fccaa4c9cc8a 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -80,12 +80,11 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { assert((RC == &ARM::tGPRRegClass || - (TargetRegisterInfo::isPhysicalRegister(SrcReg) && - isARMLowRegister(SrcReg))) && "Unknown regclass!"); + (Register::isPhysicalRegister(SrcReg) && isARMLowRegister(SrcReg))) && + "Unknown regclass!"); if (RC == &ARM::tGPRRegClass || - (TargetRegisterInfo::isPhysicalRegister(SrcReg) && - isARMLowRegister(SrcReg))) { + (Register::isPhysicalRegister(SrcReg) && isARMLowRegister(SrcReg))) { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); @@ -108,13 +107,13 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - assert((RC->hasSuperClassEq(&ARM::tGPRRegClass) || - (TargetRegisterInfo::isPhysicalRegister(DestReg) && - isARMLowRegister(DestReg))) && "Unknown regclass!"); + assert( + (RC->hasSuperClassEq(&ARM::tGPRRegClass) || + (Register::isPhysicalRegister(DestReg) && isARMLowRegister(DestReg))) && + "Unknown regclass!"); if (RC->hasSuperClassEq(&ARM::tGPRRegClass) || - (TargetRegisterInfo::isPhysicalRegister(DestReg) && - isARMLowRegister(DestReg))) { + (Register::isPhysicalRegister(DestReg) && isARMLowRegister(DestReg))) { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp index 3143eb9840ed..786fc78d0233 100644 --- a/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -87,7 +87,7 @@ static void TrackDefUses(MachineInstr *MI, RegisterSet &Defs, RegisterSet &Uses, for (auto &MO : MI->operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg || Reg == ARM::ITSTATE || Reg == ARM::SP) continue; if (MO.isUse()) @@ -145,8 +145,8 @@ Thumb2ITBlock::MoveCopyOutOfITBlock(MachineInstr *MI, MI->getOperand(1).getSubReg() == 0 && "Sub-register indices still around?"); - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); + Register DstReg = MI->getOperand(0).getReg(); + Register SrcReg = MI->getOperand(1).getReg(); // First check if 
it's safe to move it. if (Uses.count(DstReg) || Defs.count(SrcReg)) @@ -308,131 +308,3 @@ bool Thumb2ITBlock::runOnMachineFunction(MachineFunction &Fn) { /// createThumb2ITBlockPass - Returns an instance of the Thumb2 IT blocks /// insertion pass. FunctionPass *llvm::createThumb2ITBlockPass() { return new Thumb2ITBlock(); } - -#undef DEBUG_TYPE -#define DEBUG_TYPE "arm-mve-vpt" - -namespace { - class MVEVPTBlock : public MachineFunctionPass { - public: - static char ID; - const Thumb2InstrInfo *TII; - const TargetRegisterInfo *TRI; - - MVEVPTBlock() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &Fn) override; - - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoVRegs); - } - - StringRef getPassName() const override { - return "MVE VPT block insertion pass"; - } - - private: - bool InsertVPTBlocks(MachineBasicBlock &MBB); - }; - - char MVEVPTBlock::ID = 0; - -} // end anonymous namespace - -INITIALIZE_PASS(MVEVPTBlock, DEBUG_TYPE, "ARM MVE VPT block pass", false, false) - -enum VPTMaskValue { - T = 8, // 0b1000 - TT = 4, // 0b0100 - TE = 12, // 0b1100 - TTT = 2, // 0b0010 - TTE = 6, // 0b0110 - TEE = 10, // 0b1010 - TET = 14, // 0b1110 - TTTT = 1, // 0b0001 - TTTE = 3, // 0b0011 - TTEE = 5, // 0b0101 - TTET = 7, // 0b0111 - TEEE = 9, // 0b1001 - TEET = 11, // 0b1011 - TETT = 13, // 0b1101 - TETE = 15 // 0b1111 -}; - -bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) { - bool Modified = false; - MachineBasicBlock::iterator MBIter = Block.begin(); - MachineBasicBlock::iterator EndIter = Block.end(); - - while (MBIter != EndIter) { - MachineInstr *MI = &*MBIter; - unsigned PredReg = 0; - DebugLoc dl = MI->getDebugLoc(); - - ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*MI, PredReg); - - // The idea of the predicate is that None, Then and Else are for use when - // handling assembly language: they correspond to the three possible - // suffixes "", "t" and "e" on the mnemonic. So when instructions are read - // from assembly source or disassembled from object code, you expect to see - // a mixture whenever there's a long VPT block. But in code generation, we - // hope we'll never generate an Else as input to this pass. 
- - assert(Pred != ARMVCC::Else && "VPT block pass does not expect Else preds"); - - if (Pred == ARMVCC::None) { - ++MBIter; - continue; - } - - MachineInstrBuilder MIBuilder = - BuildMI(Block, MBIter, dl, TII->get(ARM::MVE_VPST)); - // The mask value for the VPST instruction is T = 0b1000 = 8 - MIBuilder.addImm(VPTMaskValue::T); - - MachineBasicBlock::iterator VPSTInsertPos = MIBuilder.getInstr(); - int VPTInstCnt = 1; - ARMVCC::VPTCodes NextPred; - - do { - ++MBIter; - NextPred = getVPTInstrPredicate(*MBIter, PredReg); - } while (NextPred != ARMVCC::None && NextPred == Pred && ++VPTInstCnt < 4); - - MachineInstr *LastMI = &*MBIter; - finalizeBundle(Block, VPSTInsertPos.getInstrIterator(), - ++LastMI->getIterator()); - - Modified = true; - LLVM_DEBUG(dbgs() << "VPT block created for: "; MI->dump();); - - ++MBIter; - } - return Modified; -} - -bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) { - const ARMSubtarget &STI = - static_cast<const ARMSubtarget &>(Fn.getSubtarget()); - - if (!STI.isThumb2() || !STI.hasMVEIntegerOps()) - return false; - - TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo()); - TRI = STI.getRegisterInfo(); - - LLVM_DEBUG(dbgs() << "********** ARM MVE VPT BLOCKS **********\n" - << "********** Function: " << Fn.getName() << '\n'); - - bool Modified = false; - for (MachineBasicBlock &MBB : Fn) - Modified |= InsertVPTBlocks(MBB); - - LLVM_DEBUG(dbgs() << "**************************************\n"); - return Modified; -} - -/// createMVEVPTBlock - Returns an instance of the MVE VPT block -/// insertion pass. -FunctionPass *llvm::createMVEVPTBlockPass() { return new MVEVPTBlock(); } diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 5a965f7a6b9b..af1f0aeb27ba 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -159,9 +159,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, // Thumb2 STRD expects its dest-registers to be in rGPR. Not a problem for // gsub_0, but needs an extra constraint for gsub_1 (which could be sp // otherwise). - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) { + if (Register::isVirtualRegister(SrcReg)) { MachineRegisterInfo *MRI = &MF.getRegInfo(); - MRI->constrainRegClass(SrcReg, &ARM::GPRPair_with_gsub_1_in_GPRwithAPSRnospRegClass); + MRI->constrainRegClass(SrcReg, &ARM::GPRPairnospRegClass); } MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8)); @@ -200,10 +200,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, // Thumb2 LDRD expects its dest-registers to be in rGPR. Not a problem for // gsub_0, but needs an extra constraint for gsub_1 (which could be sp // otherwise). 
- if (TargetRegisterInfo::isVirtualRegister(DestReg)) { + if (Register::isVirtualRegister(DestReg)) { MachineRegisterInfo *MRI = &MF.getRegInfo(); - MRI->constrainRegClass(DestReg, - &ARM::GPRPair_with_gsub_1_in_GPRwithAPSRnospRegClass); + MRI->constrainRegClass(DestReg, &ARM::GPRPairnospRegClass); } MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8)); @@ -211,7 +210,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO).add(predOps(ARMCC::AL)); - if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + if (Register::isPhysicalRegister(DestReg)) MIB.addReg(DestReg, RegState::ImplicitDefine); return; } @@ -470,12 +469,17 @@ immediateOffsetOpcode(unsigned opcode) bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, int &Offset, - const ARMBaseInstrInfo &TII) { + const ARMBaseInstrInfo &TII, + const TargetRegisterInfo *TRI) { unsigned Opcode = MI.getOpcode(); const MCInstrDesc &Desc = MI.getDesc(); unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); bool isSub = false; + MachineFunction &MF = *MI.getParent()->getParent(); + const TargetRegisterClass *RegClass = + TII.getRegClass(Desc, FrameRegIdx, TRI, MF); + // Memory operands in inline assembly always use AddrModeT2_i12. if (Opcode == ARM::INLINEASM || Opcode == ARM::INLINEASM_BR) AddrMode = ARMII::AddrModeT2_i12; // FIXME. mode for thumb2? @@ -554,7 +558,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, // register then we change to an immediate version. unsigned NewOpc = Opcode; if (AddrMode == ARMII::AddrModeT2_so) { - unsigned OffsetReg = MI.getOperand(FrameRegIdx+1).getReg(); + Register OffsetReg = MI.getOperand(FrameRegIdx + 1).getReg(); if (OffsetReg != 0) { MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); return Offset == 0; @@ -645,10 +649,21 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, MachineOperand &ImmOp = MI.getOperand(FrameRegIdx+1); // Attempt to fold address computation - // Common case: small offset, fits into instruction. + // Common case: small offset, fits into instruction. We need to make sure + // the register class is correct too, for instructions like the MVE + // VLDRH.32, which only accepts low tGPR registers. int ImmedOffset = Offset / Scale; unsigned Mask = (1 << NumBits) - 1; - if ((unsigned)Offset <= Mask * Scale) { + if ((unsigned)Offset <= Mask * Scale && + (Register::isVirtualRegister(FrameReg) || + RegClass->contains(FrameReg))) { + if (Register::isVirtualRegister(FrameReg)) { + // Make sure the register class for the virtual register is correct + MachineRegisterInfo *MRI = &MF.getRegInfo(); + if (!MRI->constrainRegClass(FrameReg, RegClass)) + llvm_unreachable("Unable to constrain virtual register class."); + } + // Replace the FrameIndex with fp/sp MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); if (isSub) { @@ -681,7 +696,8 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, } Offset = (isSub) ? 
-Offset : Offset; - return Offset == 0; + return Offset == 0 && (Register::isVirtualRegister(FrameReg) || + RegClass->contains(FrameReg)); } ARMCC::CondCodes llvm::getITInstrPredicate(const MachineInstr &MI, diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index 37a85fa38417..c5a62aa33990 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -300,7 +300,7 @@ Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) { for (const MachineOperand &MO : CPSRDef->operands()) { if (!MO.isReg() || MO.isUndef() || MO.isUse()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0 || Reg == ARM::CPSR) continue; Defs.insert(Reg); @@ -309,7 +309,7 @@ Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) { for (const MachineOperand &MO : Use->operands()) { if (!MO.isReg() || MO.isUndef() || MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Defs.count(Reg)) return false; } @@ -380,7 +380,7 @@ static bool VerifyLowRegs(MachineInstr *MI) { const MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || MO.isImplicit()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0 || Reg == ARM::CPSR) continue; if (isPCOk && Reg == ARM::PC) @@ -464,11 +464,11 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, // For this reason we can't reuse the logic at the end of this function; we // have to implement the MI building here. bool IsStore = Entry.WideOpc == ARM::t2STR_POST; - unsigned Rt = MI->getOperand(IsStore ? 1 : 0).getReg(); - unsigned Rn = MI->getOperand(IsStore ? 0 : 1).getReg(); + Register Rt = MI->getOperand(IsStore ? 1 : 0).getReg(); + Register Rn = MI->getOperand(IsStore ? 0 : 1).getReg(); unsigned Offset = MI->getOperand(3).getImm(); unsigned PredImm = MI->getOperand(4).getImm(); - unsigned PredReg = MI->getOperand(5).getReg(); + Register PredReg = MI->getOperand(5).getReg(); assert(isARMLowRegister(Rt)); assert(isARMLowRegister(Rn)); @@ -496,7 +496,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, return true; } case ARM::t2LDMIA: { - unsigned BaseReg = MI->getOperand(0).getReg(); + Register BaseReg = MI->getOperand(0).getReg(); assert(isARMLowRegister(BaseReg)); // For the non-writeback version (this one), the base register must be @@ -524,7 +524,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, break; case ARM::t2LDMIA_RET: { - unsigned BaseReg = MI->getOperand(1).getReg(); + Register BaseReg = MI->getOperand(1).getReg(); if (BaseReg != ARM::SP) return false; Opc = Entry.NarrowOpc2; // tPOP_RET @@ -537,7 +537,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, case ARM::t2STMDB_UPD: { OpNum = 0; - unsigned BaseReg = MI->getOperand(1).getReg(); + Register BaseReg = MI->getOperand(1).getReg(); if (BaseReg == ARM::SP && (Entry.WideOpc == ARM::t2LDMIA_UPD || Entry.WideOpc == ARM::t2STMDB_UPD)) { @@ -743,11 +743,11 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, // are optimizing for size. return false; - unsigned Reg0 = MI->getOperand(0).getReg(); - unsigned Reg1 = MI->getOperand(1).getReg(); + Register Reg0 = MI->getOperand(0).getReg(); + Register Reg1 = MI->getOperand(1).getReg(); // t2MUL is "special". The tied source operand is second, not first. 
if (MI->getOpcode() == ARM::t2MUL) { - unsigned Reg2 = MI->getOperand(2).getReg(); + Register Reg2 = MI->getOperand(2).getReg(); // Early exit if the regs aren't all low regs. if (!isARMLowRegister(Reg0) || !isARMLowRegister(Reg1) || !isARMLowRegister(Reg2)) @@ -782,7 +782,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, if (Imm > Limit) return false; } else { - unsigned Reg2 = MI->getOperand(2).getReg(); + Register Reg2 = MI->getOperand(2).getReg(); if (Entry.LowRegs2 && !isARMLowRegister(Reg2)) return false; } @@ -868,7 +868,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, continue; const MachineOperand &MO = MI->getOperand(i); if (MO.isReg()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg || Reg == ARM::CPSR) continue; if (Entry.LowRegs1 && !isARMLowRegister(Reg)) diff --git a/lib/Target/ARM/ThumbRegisterInfo.cpp b/lib/Target/ARM/ThumbRegisterInfo.cpp index a96417ffce4d..b0ba58d8dc4a 100644 --- a/lib/Target/ARM/ThumbRegisterInfo.cpp +++ b/lib/Target/ARM/ThumbRegisterInfo.cpp @@ -107,8 +107,9 @@ void ThumbRegisterInfo::emitLoadConstPool( MachineFunction &MF = *MBB.getParent(); const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); if (STI.isThumb1Only()) { - assert((isARMLowRegister(DestReg) || isVirtualRegister(DestReg)) && - "Thumb1 does not have ldr to high register"); + assert( + (isARMLowRegister(DestReg) || Register::isVirtualRegister(DestReg)) && + "Thumb1 does not have ldr to high register"); return emitThumb1LoadConstPool(MBB, MBBI, dl, DestReg, SubIdx, Val, Pred, PredReg, MIFlags); } @@ -141,7 +142,7 @@ static void emitThumbRegPlusImmInReg( unsigned LdReg = DestReg; if (DestReg == ARM::SP) assert(BaseReg == ARM::SP && "Unexpected!"); - if (!isARMLowRegister(DestReg) && !MRI.isVirtualRegister(DestReg)) + if (!isARMLowRegister(DestReg) && !Register::isVirtualRegister(DestReg)) LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass); if (NumBytes <= 255 && NumBytes >= 0 && CanChangeCC) { @@ -371,7 +372,7 @@ bool ThumbRegisterInfo::rewriteFrameIndex(MachineBasicBlock::iterator II, if (Opcode == ARM::tADDframe) { Offset += MI.getOperand(FrameRegIdx+1).getImm(); - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); emitThumbRegPlusImmediate(MBB, II, dl, DestReg, FrameReg, Offset, TII, *this); @@ -509,7 +510,7 @@ void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (MI.mayLoad()) { // Use the destination register to materialize sp + offset. 
- unsigned TmpReg = MI.getOperand(0).getReg(); + Register TmpReg = MI.getOperand(0).getReg(); bool UseRR = false; if (Opcode == ARM::tLDRspi) { if (FrameReg == ARM::SP || STI.genExecuteOnly()) diff --git a/lib/Target/AVR/AVRAsmPrinter.cpp b/lib/Target/AVR/AVRAsmPrinter.cpp index 7586bd7b78fc..1db6b2236b4f 100644 --- a/lib/Target/AVR/AVRAsmPrinter.cpp +++ b/lib/Target/AVR/AVRAsmPrinter.cpp @@ -97,7 +97,7 @@ bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, assert(RegOp.isReg() && "Operand must be a register when you're" "using 'A'..'Z' operand extracodes."); - unsigned Reg = RegOp.getReg(); + Register Reg = RegOp.getReg(); unsigned ByteNumber = ExtraCode[0] - 'A'; diff --git a/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/lib/Target/AVR/AVRExpandPseudoInsts.cpp index c45b2d0e39c1..83d0f6845332 100644 --- a/lib/Target/AVR/AVRExpandPseudoInsts.cpp +++ b/lib/Target/AVR/AVRExpandPseudoInsts.cpp @@ -140,8 +140,8 @@ bool AVRExpandPseudo:: expandArith(unsigned OpLo, unsigned OpHi, Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; unsigned SrcLoReg, SrcHiReg, DstLoReg, DstHiReg; - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(2).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); bool DstIsKill = MI.getOperand(1).isKill(); bool SrcIsKill = MI.getOperand(2).isKill(); @@ -173,8 +173,8 @@ bool AVRExpandPseudo:: expandLogic(unsigned Op, Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; unsigned SrcLoReg, SrcHiReg, DstLoReg, DstHiReg; - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(2).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); bool DstIsKill = MI.getOperand(1).isKill(); bool SrcIsKill = MI.getOperand(2).isKill(); @@ -220,7 +220,7 @@ bool AVRExpandPseudo:: expandLogicImm(unsigned Op, Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; unsigned DstLoReg, DstHiReg; - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); bool SrcIsKill = MI.getOperand(1).isKill(); bool ImpIsDead = MI.getOperand(3).isDead(); @@ -874,7 +874,7 @@ unsigned AVRExpandPseudo::scavengeGPR8(MachineInstr &MI) { // Exclude all the registers being used by the instruction. for (MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() && - !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + !Register::isVirtualRegister(MO.getReg())) Candidates.reset(MO.getReg()); } diff --git a/lib/Target/AVR/AVRFrameLowering.cpp b/lib/Target/AVR/AVRFrameLowering.cpp index 5e91bb8632c1..e6c48de5a782 100644 --- a/lib/Target/AVR/AVRFrameLowering.cpp +++ b/lib/Target/AVR/AVRFrameLowering.cpp @@ -30,7 +30,8 @@ namespace llvm { AVRFrameLowering::AVRFrameLowering() - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 1, -2) {} + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align::None(), + -2) {} bool AVRFrameLowering::canSimplifyCallFramePseudos( const MachineFunction &MF) const { @@ -323,7 +324,7 @@ static void fixStackStores(MachineBasicBlock &MBB, "Invalid register, should be SP!"); if (insertPushes) { // Replace this instruction with a push. 
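The AVR hunks here, like the ARM ones above, mechanically replace `unsigned` register ids with the then-new llvm::Register class. The change is behavior-preserving because Register is a transparent wrapper over the old encoding; roughly (a simplified sketch of llvm/CodeGen/Register.h, omitting the stack-slot checks):

    // Virtual registers have the sign bit set, physical registers are small
    // positive integers, and 0 means "no register".
    class Register {
      unsigned Reg;

    public:
      Register(unsigned Val = 0) : Reg(Val) {}
      operator unsigned() const { return Reg; } // unsigned-based code still compiles

      static bool isVirtualRegister(unsigned Reg) { return int(Reg) < 0; }
      static bool isPhysicalRegister(unsigned Reg) { return int(Reg) > 0; }

      bool isVirtual() const { return isVirtualRegister(Reg); }
      bool isPhysical() const { return isPhysicalRegister(Reg); }
    };

This is also why the TargetRegisterInfo::isVirtualRegister and isPhysicalRegister call sites throughout the diff migrate to the Register:: statics with no change in semantics.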
- unsigned SrcReg = MI.getOperand(2).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); bool SrcIsKill = MI.getOperand(2).isKill(); // We can't use PUSHWRr here because when expanded the order of the new diff --git a/lib/Target/AVR/AVRISelDAGToDAG.cpp b/lib/Target/AVR/AVRISelDAGToDAG.cpp index 5cb4441c4380..4c4f4faa0508 100644 --- a/lib/Target/AVR/AVRISelDAGToDAG.cpp +++ b/lib/Target/AVR/AVRISelDAGToDAG.cpp @@ -251,7 +251,7 @@ bool AVRDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op, RegisterSDNode *RegNode = cast<RegisterSDNode>(CopyFromRegOp->getOperand(1)); Reg = RegNode->getReg(); - CanHandleRegImmOpt &= (TargetRegisterInfo::isVirtualRegister(Reg) || + CanHandleRegImmOpt &= (Register::isVirtualRegister(Reg) || AVR::PTRDISPREGSRegClass.contains(Reg)); } else { CanHandleRegImmOpt = false; diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp index b6ba5f22fafb..f12c59b7d8c3 100644 --- a/lib/Target/AVR/AVRISelLowering.cpp +++ b/lib/Target/AVR/AVRISelLowering.cpp @@ -236,7 +236,7 @@ AVRTargetLowering::AVRTargetLowering(const AVRTargetMachine &TM, setLibcallName(RTLIB::SIN_F32, "sin"); setLibcallName(RTLIB::COS_F32, "cos"); - setMinFunctionAlignment(1); + setMinFunctionAlignment(Align(2)); setMinimumJumpTableEntries(UINT_MAX); } @@ -1517,11 +1517,11 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI, unsigned ShiftAmtReg = RI.createVirtualRegister(&AVR::LD8RegClass); unsigned ShiftAmtReg2 = RI.createVirtualRegister(&AVR::LD8RegClass); - unsigned ShiftReg = RI.createVirtualRegister(RC); - unsigned ShiftReg2 = RI.createVirtualRegister(RC); - unsigned ShiftAmtSrcReg = MI.getOperand(2).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register ShiftReg = RI.createVirtualRegister(RC); + Register ShiftReg2 = RI.createVirtualRegister(RC); + Register ShiftAmtSrcReg = MI.getOperand(2).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); // BB: // cpi N, 0 @@ -1568,7 +1568,7 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI, static bool isCopyMulResult(MachineBasicBlock::iterator const &I) { if (I->getOpcode() == AVR::COPY) { - unsigned SrcReg = I->getOperand(1).getReg(); + Register SrcReg = I->getOperand(1).getReg(); return (SrcReg == AVR::R0 || SrcReg == AVR::R1); } @@ -1689,6 +1689,8 @@ AVRTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { // See http://www.nongnu.org/avr-libc/user-manual/inline_asm.html switch (Constraint[0]) { + default: + break; case 'a': // Simple upper registers case 'b': // Base pointer registers pairs case 'd': // Upper register @@ -1715,9 +1717,7 @@ AVRTargetLowering::getConstraintType(StringRef Constraint) const { case 'O': // Integer constant (Range: 8, 16, 24) case 'P': // Integer constant (Range: 1) case 'R': // Integer constant (Range: -6 to 5)x - return C_Other; - default: - break; + return C_Immediate; } } @@ -2006,10 +2006,9 @@ void AVRTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } -unsigned AVRTargetLowering::getRegisterByName(const char *RegName, - EVT VT, - SelectionDAG &DAG) const { - unsigned Reg; +Register AVRTargetLowering::getRegisterByName(const char *RegName, EVT VT, + const MachineFunction &MF) const { + Register Reg; if (VT == MVT::i8) { Reg = StringSwitch<unsigned>(RegName) diff --git a/lib/Target/AVR/AVRISelLowering.h 
b/lib/Target/AVR/AVRISelLowering.h index ed2d0835903c..6c722fa5414b 100644 --- a/lib/Target/AVR/AVRISelLowering.h +++ b/lib/Target/AVR/AVRISelLowering.h @@ -125,8 +125,8 @@ public: std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; bool shouldSplitFunctionArgumentsAsLittleEndian(const DataLayout &DL) const override { diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp index a6b36f80485d..8fce05c933bc 100644 --- a/lib/Target/AVR/AVRRegisterInfo.cpp +++ b/lib/Target/AVR/AVRRegisterInfo.cpp @@ -158,7 +158,7 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // We need to materialize the offset via an add instruction. unsigned Opcode; - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); assert(DstReg != AVR::R29R28 && "Dest reg cannot be the frame pointer"); II++; // Skip over the FRMIDX (and now MOVW) instruction. diff --git a/lib/Target/AVR/AVRTargetMachine.cpp b/lib/Target/AVR/AVRTargetMachine.cpp index a36c8b0f9649..25304280d002 100644 --- a/lib/Target/AVR/AVRTargetMachine.cpp +++ b/lib/Target/AVR/AVRTargetMachine.cpp @@ -50,7 +50,7 @@ AVRTargetMachine::AVRTargetMachine(const Target &T, const Triple &TT, getEffectiveRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), SubTarget(TT, getCPU(CPU), FS, *this) { - this->TLOF = make_unique<AVRTargetObjectFile>(); + this->TLOF = std::make_unique<AVRTargetObjectFile>(); initAsmInfo(); } diff --git a/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/lib/Target/AVR/AsmParser/AVRAsmParser.cpp index aac5644711e2..af60bc4fdc90 100644 --- a/lib/Target/AVR/AsmParser/AVRAsmParser.cpp +++ b/lib/Target/AVR/AsmParser/AVRAsmParser.cpp @@ -199,22 +199,22 @@ public: } static std::unique_ptr<AVROperand> CreateToken(StringRef Str, SMLoc S) { - return make_unique<AVROperand>(Str, S); + return std::make_unique<AVROperand>(Str, S); } static std::unique_ptr<AVROperand> CreateReg(unsigned RegNum, SMLoc S, SMLoc E) { - return make_unique<AVROperand>(RegNum, S, E); + return std::make_unique<AVROperand>(RegNum, S, E); } static std::unique_ptr<AVROperand> CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) { - return make_unique<AVROperand>(Val, S, E); + return std::make_unique<AVROperand>(Val, S, E); } static std::unique_ptr<AVROperand> CreateMemri(unsigned RegNum, const MCExpr *Val, SMLoc S, SMLoc E) { - return make_unique<AVROperand>(RegNum, Val, S, E); + return std::make_unique<AVROperand>(RegNum, Val, S, E); } void makeToken(StringRef Token) { diff --git a/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp index 6025e4b2437c..1c69fea5962d 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp +++ b/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp @@ -152,7 +152,7 @@ unsigned AVRELFObjectWriter::getRelocType(MCContext &Ctx, } std::unique_ptr<MCObjectTargetWriter> createAVRELFObjectWriter(uint8_t OSABI) { - return make_unique<AVRELFObjectWriter>(OSABI); + return std::make_unique<AVRELFObjectWriter>(OSABI); } } // end of namespace llvm diff --git a/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/lib/Target/BPF/AsmParser/BPFAsmParser.cpp index 75885fd058a7..ce1d2ecd9d26 100644 --- a/lib/Target/BPF/AsmParser/BPFAsmParser.cpp +++ b/lib/Target/BPF/AsmParser/BPFAsmParser.cpp @@ -194,7 +194,7 @@ public: } static 
std::unique_ptr<BPFOperand> createToken(StringRef Str, SMLoc S) {
-    auto Op = make_unique<BPFOperand>(Token);
+    auto Op = std::make_unique<BPFOperand>(Token);
     Op->Tok = Str;
     Op->StartLoc = S;
     Op->EndLoc = S;
@@ -203,7 +203,7 @@ public:
 
   static std::unique_ptr<BPFOperand> createReg(unsigned RegNo, SMLoc S,
                                                SMLoc E) {
-    auto Op = make_unique<BPFOperand>(Register);
+    auto Op = std::make_unique<BPFOperand>(Register);
     Op->Reg.RegNum = RegNo;
     Op->StartLoc = S;
     Op->EndLoc = E;
@@ -212,7 +212,7 @@ public:
 
   static std::unique_ptr<BPFOperand> createImm(const MCExpr *Val, SMLoc S,
                                                SMLoc E) {
-    auto Op = make_unique<BPFOperand>(Immediate);
+    auto Op = std::make_unique<BPFOperand>(Immediate);
     Op->Imm.Val = Val;
     Op->StartLoc = S;
     Op->EndLoc = E;
diff --git a/lib/Target/BPF/BPF.h b/lib/Target/BPF/BPF.h
index d311fc154094..6e4f35f4c5d7 100644
--- a/lib/Target/BPF/BPF.h
+++ b/lib/Target/BPF/BPF.h
@@ -15,17 +15,19 @@
 namespace llvm {
 class BPFTargetMachine;
 
-ModulePass *createBPFAbstractMemberAccess();
+ModulePass *createBPFAbstractMemberAccess(BPFTargetMachine *TM);
 
 FunctionPass *createBPFISelDag(BPFTargetMachine &TM);
 FunctionPass *createBPFMISimplifyPatchablePass();
 FunctionPass *createBPFMIPeepholePass();
+FunctionPass *createBPFMIPeepholeTruncElimPass();
 FunctionPass *createBPFMIPreEmitPeepholePass();
 FunctionPass *createBPFMIPreEmitCheckingPass();
 
 void initializeBPFAbstractMemberAccessPass(PassRegistry&);
 void initializeBPFMISimplifyPatchablePass(PassRegistry&);
 void initializeBPFMIPeepholePass(PassRegistry&);
+void initializeBPFMIPeepholeTruncElimPass(PassRegistry&);
 void initializeBPFMIPreEmitPeepholePass(PassRegistry&);
 void initializeBPFMIPreEmitCheckingPass(PassRegistry&);
 }
diff --git a/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/lib/Target/BPF/BPFAbstractMemberAccess.cpp
index 51d4cbc8a429..400701c4e5c2 100644
--- a/lib/Target/BPF/BPFAbstractMemberAccess.cpp
+++ b/lib/Target/BPF/BPFAbstractMemberAccess.cpp
@@ -50,6 +50,28 @@
 //   addr = preserve_struct_access_index(base, gep_index, di_index)
 //          !llvm.preserve.access.index <struct_ditype>
 //
+// Bitfield member access needs special attention. The user cannot take the
+// address of a bitfield access. To make it easy for the kernel verifier to
+// optimize bitfield code, a new clang intrinsic is introduced:
+//   uint32_t __builtin_preserve_field_info(member_access, info_kind)
+// In IR, a chain with two (or more) intrinsic calls will be generated:
+//   ...
+//   addr = preserve_struct_access_index(base, 1, 1) !struct s
+//   uint32_t result = bpf_preserve_field_info(addr, info_kind)
+//
+// Suppose the info_kind is FIELD_SIGNEDNESS. The above two IR intrinsics
+// will then be replaced with a relocatable insn:
+//   signedness = /* signedness of member_access */
+// and the signedness can be changed by the bpf loader based on the
+// types on the host.
+//
+// The user can also test whether a field exists or not with
+//   uint32_t result = bpf_preserve_field_info(member_access, FIELD_EXISTENCE)
+// The field will always be available (result = 1) during the initial
+// compilation, but the bpf loader can patch it with the correct value
+// on the target host, where the member_access may or may not be available
+//
 //===----------------------------------------------------------------------===//
 
 #include "BPF.h"
@@ -65,13 +87,12 @@
 #include "llvm/IR/Value.h"
 #include "llvm/Pass.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <stack>
 
 #define DEBUG_TYPE "bpf-abstract-member-access"
 
 namespace llvm {
 const std::string BPFCoreSharedInfo::AmaAttr = "btf_ama";
-const std::string BPFCoreSharedInfo::PatchableExtSecName =
-    ".BPF.patchable_externs";
 } // namespace llvm
 
 using namespace llvm;
 
@@ -87,40 +108,62 @@ class BPFAbstractMemberAccess final : public ModulePass {
 public:
   static char ID;
-  BPFAbstractMemberAccess() : ModulePass(ID) {}
+  TargetMachine *TM;
+  // The optional BPFTargetMachine parameter lets the BPF backend create the
+  // pass with a target machine, which is needed to find out the endianness.
+  // The default constructor (without parameters) is the one the pass manager
+  // uses for its own bookkeeping.
+  BPFAbstractMemberAccess(BPFTargetMachine *TM = nullptr) : ModulePass(ID), TM(TM) {}
+
+  struct CallInfo {
+    uint32_t Kind;
+    uint32_t AccessIndex;
+    MDNode *Metadata;
+    Value *Base;
+  };
+  typedef std::stack<std::pair<CallInst *, CallInfo>> CallInfoStack;
 
 private:
   enum : uint32_t {
     BPFPreserveArrayAI = 1,
     BPFPreserveUnionAI = 2,
     BPFPreserveStructAI = 3,
+    BPFPreserveFieldInfoAI = 4,
   };
 
   std::map<std::string, GlobalVariable *> GEPGlobals;
   // A map to link preserve_*_access_index intrinsic calls.
-  std::map<CallInst *, std::pair<CallInst *, uint32_t>> AIChain;
+  std::map<CallInst *, std::pair<CallInst *, CallInfo>> AIChain;
   // A map to hold all the base preserve_*_access_index intrinsic calls.
-  // The base call is not an input of any other preserve_*_access_index
+  // The base call is not an input of any other preserve_*
   // intrinsics.
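To make the AIChain/BaseAICalls relationship concrete, consider a source access such as &p->a[2].b compiled under __builtin_preserve_access_index (a hypothetical example; the value names below are illustrative IR, not output reproduced from this patch):

    // %1 = llvm.preserve.struct.access.index(%p, gep 0, di 0)   ; ->a
    // %2 = llvm.preserve.array.access.index(%1, dim 1, idx 2)   ; [2]
    // %3 = llvm.preserve.struct.access.index(%2, gep 1, di 1)   ; .b
    //
    // traceAICall records AIChain[%2] = (%1, info of %1) and
    // AIChain[%3] = (%2, info of %2); %3 lands in BaseAICalls because no
    // further preserve_* call consumes its result.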
- std::map<CallInst *, uint32_t> BaseAICalls; + std::map<CallInst *, CallInfo> BaseAICalls; bool doTransformation(Module &M); - void traceAICall(CallInst *Call, uint32_t Kind); - void traceBitCast(BitCastInst *BitCast, CallInst *Parent, uint32_t Kind); - void traceGEP(GetElementPtrInst *GEP, CallInst *Parent, uint32_t Kind); + void traceAICall(CallInst *Call, CallInfo &ParentInfo); + void traceBitCast(BitCastInst *BitCast, CallInst *Parent, + CallInfo &ParentInfo); + void traceGEP(GetElementPtrInst *GEP, CallInst *Parent, + CallInfo &ParentInfo); void collectAICallChains(Module &M, Function &F); - bool IsPreserveDIAccessIndexCall(const CallInst *Call, uint32_t &Kind); + bool IsPreserveDIAccessIndexCall(const CallInst *Call, CallInfo &Cinfo); + bool IsValidAIChain(const MDNode *ParentMeta, uint32_t ParentAI, + const MDNode *ChildMeta); bool removePreserveAccessIndexIntrinsic(Module &M); void replaceWithGEP(std::vector<CallInst *> &CallList, uint32_t NumOfZerosIndex, uint32_t DIIndex); + bool HasPreserveFieldInfoCall(CallInfoStack &CallStack); + void GetStorageBitRange(DICompositeType *CTy, DIDerivedType *MemberTy, + uint32_t AccessIndex, uint32_t &StartBitOffset, + uint32_t &EndBitOffset); + uint32_t GetFieldInfo(uint32_t InfoKind, DICompositeType *CTy, + uint32_t AccessIndex, uint32_t PatchImm); - Value *computeBaseAndAccessStr(CallInst *Call, std::string &AccessStr, - std::string &AccessKey, uint32_t Kind, - MDNode *&TypeMeta); - bool getAccessIndex(const Value *IndexValue, uint64_t &AccessIndex); - bool transformGEPChain(Module &M, CallInst *Call, uint32_t Kind); + Value *computeBaseAndAccessKey(CallInst *Call, CallInfo &CInfo, + std::string &AccessKey, MDNode *&BaseMeta); + uint64_t getConstant(const Value *IndexValue); + bool transformGEPChain(Module &M, CallInst *Call, CallInfo &CInfo); }; } // End anonymous namespace @@ -128,23 +171,65 @@ char BPFAbstractMemberAccess::ID = 0; INITIALIZE_PASS(BPFAbstractMemberAccess, DEBUG_TYPE, "abstracting struct/union member accessees", false, false) -ModulePass *llvm::createBPFAbstractMemberAccess() { - return new BPFAbstractMemberAccess(); +ModulePass *llvm::createBPFAbstractMemberAccess(BPFTargetMachine *TM) { + return new BPFAbstractMemberAccess(TM); } bool BPFAbstractMemberAccess::runOnModule(Module &M) { LLVM_DEBUG(dbgs() << "********** Abstract Member Accesses **********\n"); // Bail out if no debug info. 
- if (empty(M.debug_compile_units())) + if (M.debug_compile_units().empty()) return false; return doTransformation(M); } +static bool SkipDIDerivedTag(unsigned Tag) { + if (Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type && + Tag != dwarf::DW_TAG_volatile_type && + Tag != dwarf::DW_TAG_restrict_type && + Tag != dwarf::DW_TAG_member) + return false; + return true; +} + +static DIType * stripQualifiers(DIType *Ty) { + while (auto *DTy = dyn_cast<DIDerivedType>(Ty)) { + if (!SkipDIDerivedTag(DTy->getTag())) + break; + Ty = DTy->getBaseType(); + } + return Ty; +} + +static const DIType * stripQualifiers(const DIType *Ty) { + while (auto *DTy = dyn_cast<DIDerivedType>(Ty)) { + if (!SkipDIDerivedTag(DTy->getTag())) + break; + Ty = DTy->getBaseType(); + } + return Ty; +} + +static uint32_t calcArraySize(const DICompositeType *CTy, uint32_t StartDim) { + DINodeArray Elements = CTy->getElements(); + uint32_t DimSize = 1; + for (uint32_t I = StartDim; I < Elements.size(); ++I) { + if (auto *Element = dyn_cast_or_null<DINode>(Elements[I])) + if (Element->getTag() == dwarf::DW_TAG_subrange_type) { + const DISubrange *SR = cast<DISubrange>(Element); + auto *CI = SR->getCount().dyn_cast<ConstantInt *>(); + DimSize *= CI->getSExtValue(); + } + } + + return DimSize; +} + /// Check whether a call is a preserve_*_access_index intrinsic call or not. bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call, - uint32_t &Kind) { + CallInfo &CInfo) { if (!Call) return false; @@ -152,15 +237,40 @@ bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call, if (!GV) return false; if (GV->getName().startswith("llvm.preserve.array.access.index")) { - Kind = BPFPreserveArrayAI; + CInfo.Kind = BPFPreserveArrayAI; + CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index); + if (!CInfo.Metadata) + report_fatal_error("Missing metadata for llvm.preserve.array.access.index intrinsic"); + CInfo.AccessIndex = getConstant(Call->getArgOperand(2)); + CInfo.Base = Call->getArgOperand(0); return true; } if (GV->getName().startswith("llvm.preserve.union.access.index")) { - Kind = BPFPreserveUnionAI; + CInfo.Kind = BPFPreserveUnionAI; + CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index); + if (!CInfo.Metadata) + report_fatal_error("Missing metadata for llvm.preserve.union.access.index intrinsic"); + CInfo.AccessIndex = getConstant(Call->getArgOperand(1)); + CInfo.Base = Call->getArgOperand(0); return true; } if (GV->getName().startswith("llvm.preserve.struct.access.index")) { - Kind = BPFPreserveStructAI; + CInfo.Kind = BPFPreserveStructAI; + CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index); + if (!CInfo.Metadata) + report_fatal_error("Missing metadata for llvm.preserve.struct.access.index intrinsic"); + CInfo.AccessIndex = getConstant(Call->getArgOperand(2)); + CInfo.Base = Call->getArgOperand(0); + return true; + } + if (GV->getName().startswith("llvm.bpf.preserve.field.info")) { + CInfo.Kind = BPFPreserveFieldInfoAI; + CInfo.Metadata = nullptr; + // Check validity of info_kind as clang did not check this. 
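The info_kind values validated below are defined by BPFCoreSharedInfo in this patch series and track libbpf's bpf_field_info_kind. The numbering here is a sketch reproduced for readability (the header in the patch is authoritative):

    enum PatchableRelocKind : uint32_t {
      FIELD_BYTE_OFFSET = 0, // byte offset of the field's storage unit
      FIELD_BYTE_SIZE,       // load width needed to read the field, in bytes
      FIELD_EXISTENCE,       // 1 if the field exists on the target, else 0
      FIELD_SIGNEDNESS,      // 1 if the field's base type is signed
      FIELD_LSHIFT_U64,      // left shift that discards unrelated high bits
      FIELD_RSHIFT_U64,      // right shift that sign/zero-extends the value
      MAX_FIELD_RELOC_KIND,  // upper bound used by the range check below
    };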
+    uint64_t InfoKind = getConstant(Call->getArgOperand(1));
+    if (InfoKind >= BPFCoreSharedInfo::MAX_FIELD_RELOC_KIND)
+      report_fatal_error("Incorrect info_kind for llvm.bpf.preserve.field.info intrinsic");
+    CInfo.AccessIndex = InfoKind;
     return true;
   }
 
   return false;
 }
@@ -173,8 +283,7 @@ void BPFAbstractMemberAccess::replaceWithGEP(std::vector<CallInst *> &CallList,
   for (auto Call : CallList) {
     uint32_t Dimension = 1;
     if (DimensionIndex > 0)
-      Dimension = cast<ConstantInt>(Call->getArgOperand(DimensionIndex))
-                      ->getZExtValue();
+      Dimension = getConstant(Call->getArgOperand(DimensionIndex));
 
     Constant *Zero =
         ConstantInt::get(Type::getInt32Ty(Call->getParent()->getContext()), 0);
@@ -200,14 +309,14 @@ bool BPFAbstractMemberAccess::removePreserveAccessIndexIntrinsic(Module &M) {
     for (auto &BB : F)
       for (auto &I : BB) {
         auto *Call = dyn_cast<CallInst>(&I);
-        uint32_t Kind;
-        if (!IsPreserveDIAccessIndexCall(Call, Kind))
+        CallInfo CInfo;
+        if (!IsPreserveDIAccessIndexCall(Call, CInfo))
           continue;
 
         Found = true;
-        if (Kind == BPFPreserveArrayAI)
+        if (CInfo.Kind == BPFPreserveArrayAI)
           PreserveArrayIndexCalls.push_back(Call);
-        else if (Kind == BPFPreserveUnionAI)
+        else if (CInfo.Kind == BPFPreserveUnionAI)
           PreserveUnionIndexCalls.push_back(Call);
         else
           PreserveStructIndexCalls.push_back(Call);
@@ -233,79 +342,146 @@ bool BPFAbstractMemberAccess::removePreserveAccessIndexIntrinsic(Module &M) {
   return Found;
 }
 
-void BPFAbstractMemberAccess::traceAICall(CallInst *Call, uint32_t Kind) {
+/// Check whether the access index chain is valid. We check
+/// here because there may be type casts between two
+/// access indexes. We want to ensure the memory access is still valid.
+bool BPFAbstractMemberAccess::IsValidAIChain(const MDNode *ParentType,
+                                             uint32_t ParentAI,
+                                             const MDNode *ChildType) {
+  if (!ChildType)
+    return true; // preserve_field_info, no type comparison needed.
+
+  const DIType *PType = stripQualifiers(cast<DIType>(ParentType));
+  const DIType *CType = stripQualifiers(cast<DIType>(ChildType));
+
+  // Child is a derived/pointer type, which is due to type casting.
+  // A pointer type cannot be in the middle of the chain.
+  if (isa<DIDerivedType>(CType))
+    return false;
+
+  // Parent is a pointer type.
+ if (const auto *PtrTy = dyn_cast<DIDerivedType>(PType)) { + if (PtrTy->getTag() != dwarf::DW_TAG_pointer_type) + return false; + return stripQualifiers(PtrTy->getBaseType()) == CType; + } + + // Otherwise, struct/union/array types + const auto *PTy = dyn_cast<DICompositeType>(PType); + const auto *CTy = dyn_cast<DICompositeType>(CType); + assert(PTy && CTy && "ParentType or ChildType is null or not composite"); + + uint32_t PTyTag = PTy->getTag(); + assert(PTyTag == dwarf::DW_TAG_array_type || + PTyTag == dwarf::DW_TAG_structure_type || + PTyTag == dwarf::DW_TAG_union_type); + + uint32_t CTyTag = CTy->getTag(); + assert(CTyTag == dwarf::DW_TAG_array_type || + CTyTag == dwarf::DW_TAG_structure_type || + CTyTag == dwarf::DW_TAG_union_type); + + // Multi dimensional arrays, base element should be the same + if (PTyTag == dwarf::DW_TAG_array_type && PTyTag == CTyTag) + return PTy->getBaseType() == CTy->getBaseType(); + + DIType *Ty; + if (PTyTag == dwarf::DW_TAG_array_type) + Ty = PTy->getBaseType(); + else + Ty = dyn_cast<DIType>(PTy->getElements()[ParentAI]); + + return dyn_cast<DICompositeType>(stripQualifiers(Ty)) == CTy; +} + +void BPFAbstractMemberAccess::traceAICall(CallInst *Call, + CallInfo &ParentInfo) { for (User *U : Call->users()) { Instruction *Inst = dyn_cast<Instruction>(U); if (!Inst) continue; if (auto *BI = dyn_cast<BitCastInst>(Inst)) { - traceBitCast(BI, Call, Kind); + traceBitCast(BI, Call, ParentInfo); } else if (auto *CI = dyn_cast<CallInst>(Inst)) { - uint32_t CIKind; - if (IsPreserveDIAccessIndexCall(CI, CIKind)) { - AIChain[CI] = std::make_pair(Call, Kind); - traceAICall(CI, CIKind); + CallInfo ChildInfo; + + if (IsPreserveDIAccessIndexCall(CI, ChildInfo) && + IsValidAIChain(ParentInfo.Metadata, ParentInfo.AccessIndex, + ChildInfo.Metadata)) { + AIChain[CI] = std::make_pair(Call, ParentInfo); + traceAICall(CI, ChildInfo); } else { - BaseAICalls[Call] = Kind; + BaseAICalls[Call] = ParentInfo; } } else if (auto *GI = dyn_cast<GetElementPtrInst>(Inst)) { if (GI->hasAllZeroIndices()) - traceGEP(GI, Call, Kind); + traceGEP(GI, Call, ParentInfo); else - BaseAICalls[Call] = Kind; + BaseAICalls[Call] = ParentInfo; + } else { + BaseAICalls[Call] = ParentInfo; } } } void BPFAbstractMemberAccess::traceBitCast(BitCastInst *BitCast, - CallInst *Parent, uint32_t Kind) { + CallInst *Parent, + CallInfo &ParentInfo) { for (User *U : BitCast->users()) { Instruction *Inst = dyn_cast<Instruction>(U); if (!Inst) continue; if (auto *BI = dyn_cast<BitCastInst>(Inst)) { - traceBitCast(BI, Parent, Kind); + traceBitCast(BI, Parent, ParentInfo); } else if (auto *CI = dyn_cast<CallInst>(Inst)) { - uint32_t CIKind; - if (IsPreserveDIAccessIndexCall(CI, CIKind)) { - AIChain[CI] = std::make_pair(Parent, Kind); - traceAICall(CI, CIKind); + CallInfo ChildInfo; + if (IsPreserveDIAccessIndexCall(CI, ChildInfo) && + IsValidAIChain(ParentInfo.Metadata, ParentInfo.AccessIndex, + ChildInfo.Metadata)) { + AIChain[CI] = std::make_pair(Parent, ParentInfo); + traceAICall(CI, ChildInfo); } else { - BaseAICalls[Parent] = Kind; + BaseAICalls[Parent] = ParentInfo; } } else if (auto *GI = dyn_cast<GetElementPtrInst>(Inst)) { if (GI->hasAllZeroIndices()) - traceGEP(GI, Parent, Kind); + traceGEP(GI, Parent, ParentInfo); else - BaseAICalls[Parent] = Kind; + BaseAICalls[Parent] = ParentInfo; + } else { + BaseAICalls[Parent] = ParentInfo; } } } void BPFAbstractMemberAccess::traceGEP(GetElementPtrInst *GEP, CallInst *Parent, - uint32_t Kind) { + CallInfo &ParentInfo) { for (User *U : GEP->users()) { Instruction *Inst 
= dyn_cast<Instruction>(U);
     if (!Inst)
       continue;
 
     if (auto *BI = dyn_cast<BitCastInst>(Inst)) {
-      traceBitCast(BI, Parent, Kind);
+      traceBitCast(BI, Parent, ParentInfo);
     } else if (auto *CI = dyn_cast<CallInst>(Inst)) {
-      uint32_t CIKind;
-      if (IsPreserveDIAccessIndexCall(CI, CIKind)) {
-        AIChain[CI] = std::make_pair(Parent, Kind);
-        traceAICall(CI, CIKind);
+      CallInfo ChildInfo;
+      if (IsPreserveDIAccessIndexCall(CI, ChildInfo) &&
+          IsValidAIChain(ParentInfo.Metadata, ParentInfo.AccessIndex,
+                         ChildInfo.Metadata)) {
+        AIChain[CI] = std::make_pair(Parent, ParentInfo);
+        traceAICall(CI, ChildInfo);
       } else {
-        BaseAICalls[Parent] = Kind;
+        BaseAICalls[Parent] = ParentInfo;
       }
     } else if (auto *GI = dyn_cast<GetElementPtrInst>(Inst)) {
       if (GI->hasAllZeroIndices())
-        traceGEP(GI, Parent, Kind);
+        traceGEP(GI, Parent, ParentInfo);
       else
-        BaseAICalls[Parent] = Kind;
+        BaseAICalls[Parent] = ParentInfo;
+    } else {
+      BaseAICalls[Parent] = ParentInfo;
     }
   }
 }
 
@@ -316,92 +492,345 @@ void BPFAbstractMemberAccess::collectAICallChains(Module &M, Function &F) {
 
   for (auto &BB : F)
     for (auto &I : BB) {
-      uint32_t Kind;
+      CallInfo CInfo;
       auto *Call = dyn_cast<CallInst>(&I);
-      if (!IsPreserveDIAccessIndexCall(Call, Kind) ||
+      if (!IsPreserveDIAccessIndexCall(Call, CInfo) ||
           AIChain.find(Call) != AIChain.end())
         continue;
 
-      traceAICall(Call, Kind);
+      traceAICall(Call, CInfo);
     }
 }
 
-/// Get access index from the preserve_*_access_index intrinsic calls.
-bool BPFAbstractMemberAccess::getAccessIndex(const Value *IndexValue,
-                                             uint64_t &AccessIndex) {
+uint64_t BPFAbstractMemberAccess::getConstant(const Value *IndexValue) {
   const ConstantInt *CV = dyn_cast<ConstantInt>(IndexValue);
-  if (!CV)
-    return false;
+  assert(CV);
+  return CV->getValue().getZExtValue();
+}
 
-  AccessIndex = CV->getValue().getZExtValue();
-  return true;
+/// Get the start and the end of storage offset for \p MemberTy.
+/// The storage bits correspond to the LLVM internal types,
+/// and the storage bits for the member determine what load width
+/// to use in order to extract the bitfield value.
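A worked example of the storage-range computation, assuming the usual C layout rules (hypothetical struct; int aligned to 4 bytes):

    // struct S { char a : 3; char b : 5; int c; };
    //
    // For member 'a': StartBitOffset = 0, a's storage offset. Walking the
    // later members, 'b' shares storage offset 0 so the scan continues, and
    // 'c' is not a bitfield, so EndBitOffset = c's offset in bits = 32.
    // The bitfield is therefore extracted through a (32 - 0) / 8 = 4-byte
    // wide load starting at byte 0.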
+void BPFAbstractMemberAccess::GetStorageBitRange(DICompositeType *CTy, + DIDerivedType *MemberTy, + uint32_t AccessIndex, + uint32_t &StartBitOffset, + uint32_t &EndBitOffset) { + auto SOff = dyn_cast<ConstantInt>(MemberTy->getStorageOffsetInBits()); + assert(SOff); + StartBitOffset = SOff->getZExtValue(); + + EndBitOffset = CTy->getSizeInBits(); + uint32_t Index = AccessIndex + 1; + for (; Index < CTy->getElements().size(); ++Index) { + auto Member = cast<DIDerivedType>(CTy->getElements()[Index]); + if (!Member->getStorageOffsetInBits()) { + EndBitOffset = Member->getOffsetInBits(); + break; + } + SOff = dyn_cast<ConstantInt>(Member->getStorageOffsetInBits()); + assert(SOff); + unsigned BitOffset = SOff->getZExtValue(); + if (BitOffset != StartBitOffset) { + EndBitOffset = BitOffset; + break; + } + } } -/// Compute the base of the whole preserve_*_access_index chains, i.e., the base +uint32_t BPFAbstractMemberAccess::GetFieldInfo(uint32_t InfoKind, + DICompositeType *CTy, + uint32_t AccessIndex, + uint32_t PatchImm) { + if (InfoKind == BPFCoreSharedInfo::FIELD_EXISTENCE) + return 1; + + uint32_t Tag = CTy->getTag(); + if (InfoKind == BPFCoreSharedInfo::FIELD_BYTE_OFFSET) { + if (Tag == dwarf::DW_TAG_array_type) { + auto *EltTy = stripQualifiers(CTy->getBaseType()); + PatchImm += AccessIndex * calcArraySize(CTy, 1) * + (EltTy->getSizeInBits() >> 3); + } else if (Tag == dwarf::DW_TAG_structure_type) { + auto *MemberTy = cast<DIDerivedType>(CTy->getElements()[AccessIndex]); + if (!MemberTy->isBitField()) { + PatchImm += MemberTy->getOffsetInBits() >> 3; + } else { + auto SOffset = dyn_cast<ConstantInt>(MemberTy->getStorageOffsetInBits()); + assert(SOffset); + PatchImm += SOffset->getZExtValue() >> 3; + } + } + return PatchImm; + } + + if (InfoKind == BPFCoreSharedInfo::FIELD_BYTE_SIZE) { + if (Tag == dwarf::DW_TAG_array_type) { + auto *EltTy = stripQualifiers(CTy->getBaseType()); + return calcArraySize(CTy, 1) * (EltTy->getSizeInBits() >> 3); + } else { + auto *MemberTy = cast<DIDerivedType>(CTy->getElements()[AccessIndex]); + uint32_t SizeInBits = MemberTy->getSizeInBits(); + if (!MemberTy->isBitField()) + return SizeInBits >> 3; + + unsigned SBitOffset, NextSBitOffset; + GetStorageBitRange(CTy, MemberTy, AccessIndex, SBitOffset, NextSBitOffset); + SizeInBits = NextSBitOffset - SBitOffset; + if (SizeInBits & (SizeInBits - 1)) + report_fatal_error("Unsupported field expression for llvm.bpf.preserve.field.info"); + return SizeInBits >> 3; + } + } + + if (InfoKind == BPFCoreSharedInfo::FIELD_SIGNEDNESS) { + const DIType *BaseTy; + if (Tag == dwarf::DW_TAG_array_type) { + // Signedness only checked when final array elements are accessed. + if (CTy->getElements().size() != 1) + report_fatal_error("Invalid array expression for llvm.bpf.preserve.field.info"); + BaseTy = stripQualifiers(CTy->getBaseType()); + } else { + auto *MemberTy = cast<DIDerivedType>(CTy->getElements()[AccessIndex]); + BaseTy = stripQualifiers(MemberTy->getBaseType()); + } + + // Only basic types and enum types have signedness. + const auto *BTy = dyn_cast<DIBasicType>(BaseTy); + while (!BTy) { + const auto *CompTy = dyn_cast<DICompositeType>(BaseTy); + // Report an error if the field expression does not have signedness. 
+ if (!CompTy || CompTy->getTag() != dwarf::DW_TAG_enumeration_type) + report_fatal_error("Invalid field expression for llvm.bpf.preserve.field.info"); + BaseTy = stripQualifiers(CompTy->getBaseType()); + BTy = dyn_cast<DIBasicType>(BaseTy); + } + uint32_t Encoding = BTy->getEncoding(); + return (Encoding == dwarf::DW_ATE_signed || Encoding == dwarf::DW_ATE_signed_char); + } + + if (InfoKind == BPFCoreSharedInfo::FIELD_LSHIFT_U64) { + // The value is loaded into a value with FIELD_BYTE_SIZE size, + // and then zero or sign extended to U64. + // FIELD_LSHIFT_U64 and FIELD_RSHIFT_U64 are operations + // to extract the original value. + const Triple &Triple = TM->getTargetTriple(); + DIDerivedType *MemberTy = nullptr; + bool IsBitField = false; + uint32_t SizeInBits; + + if (Tag == dwarf::DW_TAG_array_type) { + auto *EltTy = stripQualifiers(CTy->getBaseType()); + SizeInBits = calcArraySize(CTy, 1) * EltTy->getSizeInBits(); + } else { + MemberTy = cast<DIDerivedType>(CTy->getElements()[AccessIndex]); + SizeInBits = MemberTy->getSizeInBits(); + IsBitField = MemberTy->isBitField(); + } + + if (!IsBitField) { + if (SizeInBits > 64) + report_fatal_error("too big field size for llvm.bpf.preserve.field.info"); + return 64 - SizeInBits; + } + + unsigned SBitOffset, NextSBitOffset; + GetStorageBitRange(CTy, MemberTy, AccessIndex, SBitOffset, NextSBitOffset); + if (NextSBitOffset - SBitOffset > 64) + report_fatal_error("too big field size for llvm.bpf.preserve.field.info"); + + unsigned OffsetInBits = MemberTy->getOffsetInBits(); + if (Triple.getArch() == Triple::bpfel) + return SBitOffset + 64 - OffsetInBits - SizeInBits; + else + return OffsetInBits + 64 - NextSBitOffset; + } + + if (InfoKind == BPFCoreSharedInfo::FIELD_RSHIFT_U64) { + DIDerivedType *MemberTy = nullptr; + bool IsBitField = false; + uint32_t SizeInBits; + if (Tag == dwarf::DW_TAG_array_type) { + auto *EltTy = stripQualifiers(CTy->getBaseType()); + SizeInBits = calcArraySize(CTy, 1) * EltTy->getSizeInBits(); + } else { + MemberTy = cast<DIDerivedType>(CTy->getElements()[AccessIndex]); + SizeInBits = MemberTy->getSizeInBits(); + IsBitField = MemberTy->isBitField(); + } + + if (!IsBitField) { + if (SizeInBits > 64) + report_fatal_error("too big field size for llvm.bpf.preserve.field.info"); + return 64 - SizeInBits; + } + + unsigned SBitOffset, NextSBitOffset; + GetStorageBitRange(CTy, MemberTy, AccessIndex, SBitOffset, NextSBitOffset); + if (NextSBitOffset - SBitOffset > 64) + report_fatal_error("too big field size for llvm.bpf.preserve.field.info"); + + return 64 - SizeInBits; + } + + llvm_unreachable("Unknown llvm.bpf.preserve.field.info info kind"); +} + +bool BPFAbstractMemberAccess::HasPreserveFieldInfoCall(CallInfoStack &CallStack) { + // This is called in error return path, no need to maintain CallStack. + while (CallStack.size()) { + auto StackElem = CallStack.top(); + if (StackElem.second.Kind == BPFPreserveFieldInfoAI) + return true; + CallStack.pop(); + } + return false; +} + +/// Compute the base of the whole preserve_* intrinsics chains, i.e., the base /// pointer of the first preserve_*_access_index call, and construct the access /// string, which will be the name of a global variable. 
-Value *BPFAbstractMemberAccess::computeBaseAndAccessStr(CallInst *Call, - std::string &AccessStr, +Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call, + CallInfo &CInfo, std::string &AccessKey, - uint32_t Kind, MDNode *&TypeMeta) { Value *Base = nullptr; - std::vector<uint64_t> AccessIndices; - uint64_t TypeNameIndex = 0; - std::string LastTypeName; + std::string TypeName; + CallInfoStack CallStack; + // Put the access chain into a stack with the top as the head of the chain. while (Call) { - // Base of original corresponding GEP - Base = Call->getArgOperand(0); + CallStack.push(std::make_pair(Call, CInfo)); + CInfo = AIChain[Call].second; + Call = AIChain[Call].first; + } - // Type Name - std::string TypeName; - MDNode *MDN; - if (Kind == BPFPreserveUnionAI || Kind == BPFPreserveStructAI) { - MDN = Call->getMetadata(LLVMContext::MD_preserve_access_index); - if (!MDN) - return nullptr; + // The access offset from the base of the head of chain is also + // calculated here as all debuginfo types are available. - DIType *Ty = dyn_cast<DIType>(MDN); - if (!Ty) - return nullptr; + // Get type name and calculate the first index. + // We only want to get type name from structure or union. + // If user wants a relocation like + // int *p; ... __builtin_preserve_access_index(&p[4]) ... + // or + // int a[10][20]; ... __builtin_preserve_access_index(&a[2][3]) ... + // we will skip them. + uint32_t FirstIndex = 0; + uint32_t PatchImm = 0; // AccessOffset or the requested field info + uint32_t InfoKind = BPFCoreSharedInfo::FIELD_BYTE_OFFSET; + while (CallStack.size()) { + auto StackElem = CallStack.top(); + Call = StackElem.first; + CInfo = StackElem.second; + if (!Base) + Base = CInfo.Base; + + DIType *Ty = stripQualifiers(cast<DIType>(CInfo.Metadata)); + if (CInfo.Kind == BPFPreserveUnionAI || + CInfo.Kind == BPFPreserveStructAI) { + // struct or union type TypeName = Ty->getName(); + TypeMeta = Ty; + PatchImm += FirstIndex * (Ty->getSizeInBits() >> 3); + break; } - // Access Index - uint64_t AccessIndex; - uint32_t ArgIndex = (Kind == BPFPreserveUnionAI) ? 1 : 2; - if (!getAccessIndex(Call->getArgOperand(ArgIndex), AccessIndex)) - return nullptr; + assert(CInfo.Kind == BPFPreserveArrayAI); + + // Array entries will always be consumed for accumulative initial index. 
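A concrete trace of this accumulation (hypothetical types; FIELD_BYTE_OFFSET assumed to be kind 0):

    // struct s { int a; int b; } *p;            // 8-byte struct assumed
    // ... __builtin_preserve_access_index(&p[4].b) ...
    //
    // The array-style access through the pointer contributes FirstIndex = 4
    // and PatchImm = 4 * sizeof(struct s) = 32; the struct access for .b then
    // adds its 4-byte offset, giving PatchImm = 36 and an access key such as
    //   "s:0:36$4:1"   (type : info kind : patched imm $ index chain).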
+ CallStack.pop(); + + // BPFPreserveArrayAI + uint64_t AccessIndex = CInfo.AccessIndex; + + DIType *BaseTy = nullptr; + bool CheckElemType = false; + if (const auto *CTy = dyn_cast<DICompositeType>(Ty)) { + // array type + assert(CTy->getTag() == dwarf::DW_TAG_array_type); + - AccessIndices.push_back(AccessIndex); - if (TypeName.size()) { - TypeNameIndex = AccessIndices.size() - 1; - LastTypeName = TypeName; - TypeMeta = MDN; + FirstIndex += AccessIndex * calcArraySize(CTy, 1); + BaseTy = stripQualifiers(CTy->getBaseType()); + CheckElemType = CTy->getElements().size() == 1; + } else { + // pointer type + auto *DTy = cast<DIDerivedType>(Ty); + assert(DTy->getTag() == dwarf::DW_TAG_pointer_type); + + BaseTy = stripQualifiers(DTy->getBaseType()); + CTy = dyn_cast<DICompositeType>(BaseTy); + if (!CTy) { + CheckElemType = true; + } else if (CTy->getTag() != dwarf::DW_TAG_array_type) { + FirstIndex += AccessIndex; + CheckElemType = true; + } else { + FirstIndex += AccessIndex * calcArraySize(CTy, 0); + } } - Kind = AIChain[Call].second; - Call = AIChain[Call].first; + if (CheckElemType) { + auto *CTy = dyn_cast<DICompositeType>(BaseTy); + if (!CTy) { + if (HasPreserveFieldInfoCall(CallStack)) + report_fatal_error("Invalid field access for llvm.preserve.field.info intrinsic"); + return nullptr; + } + + unsigned CTag = CTy->getTag(); + if (CTag == dwarf::DW_TAG_structure_type || CTag == dwarf::DW_TAG_union_type) { + TypeName = CTy->getName(); + } else { + if (HasPreserveFieldInfoCall(CallStack)) + report_fatal_error("Invalid field access for llvm.preserve.field.info intrinsic"); + return nullptr; + } + TypeMeta = CTy; + PatchImm += FirstIndex * (CTy->getSizeInBits() >> 3); + break; + } } + assert(TypeName.size()); + AccessKey += std::to_string(FirstIndex); + + // Traverse the rest of access chain to complete offset calculation + // and access key construction. + while (CallStack.size()) { + auto StackElem = CallStack.top(); + CInfo = StackElem.second; + CallStack.pop(); - // The intial type name is required. - // FIXME: if the initial type access is an array index, e.g., - // &a[3].b.c, only one dimentional array is supported. - if (!LastTypeName.size() || AccessIndices.size() > TypeNameIndex + 2) - return nullptr; + if (CInfo.Kind == BPFPreserveFieldInfoAI) + break; - // Construct the type string AccessStr. - for (unsigned I = 0; I < AccessIndices.size(); ++I) - AccessStr = std::to_string(AccessIndices[I]) + ":" + AccessStr; + // If the next Call (the top of the stack) is a BPFPreserveFieldInfoAI, + // the action will be extracting field info. + if (CallStack.size()) { + auto StackElem2 = CallStack.top(); + CallInfo CInfo2 = StackElem2.second; + if (CInfo2.Kind == BPFPreserveFieldInfoAI) { + InfoKind = CInfo2.AccessIndex; + assert(CallStack.size() == 1); + } + } + + // Access Index + uint64_t AccessIndex = CInfo.AccessIndex; + AccessKey += ":" + std::to_string(AccessIndex); - if (TypeNameIndex == AccessIndices.size() - 1) - AccessStr = "0:" + AccessStr; + MDNode *MDN = CInfo.Metadata; + // At this stage, it cannot be pointer type. + auto *CTy = cast<DICompositeType>(stripQualifiers(cast<DIType>(MDN))); + PatchImm = GetFieldInfo(InfoKind, CTy, AccessIndex, PatchImm); + } - // Access key is the type name + access string, uniquely identifying - // one kernel memory access. - AccessKey = LastTypeName + ":" + AccessStr; + // Access key is the type name + reloc type + patched imm + access string, + // uniquely identifying one relocation. 
+ AccessKey = TypeName + ":" + std::to_string(InfoKind) + ":" + + std::to_string(PatchImm) + "$" + AccessKey; return Base; } @@ -409,39 +838,52 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessStr(CallInst *Call, /// Call/Kind is the base preserve_*_access_index() call. Attempts to do /// transformation to a chain of relocable GEPs. bool BPFAbstractMemberAccess::transformGEPChain(Module &M, CallInst *Call, - uint32_t Kind) { - std::string AccessStr, AccessKey; - MDNode *TypeMeta = nullptr; + CallInfo &CInfo) { + std::string AccessKey; + MDNode *TypeMeta; Value *Base = - computeBaseAndAccessStr(Call, AccessStr, AccessKey, Kind, TypeMeta); + computeBaseAndAccessKey(Call, CInfo, AccessKey, TypeMeta); if (!Base) return false; - // Do the transformation - // For any original GEP Call and Base %2 like - // %4 = bitcast %struct.net_device** %dev1 to i64* - // it is transformed to: - // %6 = load __BTF_0:sk_buff:0:0:2:0: - // %7 = bitcast %struct.sk_buff* %2 to i8* - // %8 = getelementptr i8, i8* %7, %6 - // %9 = bitcast i8* %8 to i64* - // using %9 instead of %4 - // The original Call inst is removed. BasicBlock *BB = Call->getParent(); GlobalVariable *GV; if (GEPGlobals.find(AccessKey) == GEPGlobals.end()) { - GV = new GlobalVariable(M, Type::getInt64Ty(BB->getContext()), false, - GlobalVariable::ExternalLinkage, NULL, AccessStr); + IntegerType *VarType; + if (CInfo.Kind == BPFPreserveFieldInfoAI) + VarType = Type::getInt32Ty(BB->getContext()); // 32bit return value + else + VarType = Type::getInt64Ty(BB->getContext()); // 64bit ptr arith + + GV = new GlobalVariable(M, VarType, false, GlobalVariable::ExternalLinkage, + NULL, AccessKey); GV->addAttribute(BPFCoreSharedInfo::AmaAttr); - // Set the metadata (debuginfo types) for the global. - if (TypeMeta) - GV->setMetadata(LLVMContext::MD_preserve_access_index, TypeMeta); + GV->setMetadata(LLVMContext::MD_preserve_access_index, TypeMeta); GEPGlobals[AccessKey] = GV; } else { GV = GEPGlobals[AccessKey]; } + if (CInfo.Kind == BPFPreserveFieldInfoAI) { + // Load the global variable which represents the returned field info. + auto *LDInst = new LoadInst(Type::getInt32Ty(BB->getContext()), GV); + BB->getInstList().insert(Call->getIterator(), LDInst); + Call->replaceAllUsesWith(LDInst); + Call->eraseFromParent(); + return true; + } + + // For any original GEP Call and Base %2 like + // %4 = bitcast %struct.net_device** %dev1 to i64* + // it is transformed to: + // %6 = load sk_buff:50:$0:0:0:2:0 + // %7 = bitcast %struct.sk_buff* %2 to i8* + // %8 = getelementptr i8, i8* %7, %6 + // %9 = bitcast i8* %8 to i64* + // using %9 instead of %4 + // The original Call inst is removed. + // Load the global variable. auto *LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV); BB->getInstList().insert(Call->getIterator(), LDInst); diff --git a/lib/Target/BPF/BPFAsmPrinter.cpp b/lib/Target/BPF/BPFAsmPrinter.cpp index e61e73468057..218b0302927c 100644 --- a/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/lib/Target/BPF/BPFAsmPrinter.cpp @@ -59,7 +59,7 @@ bool BPFAsmPrinter::doInitialization(Module &M) { AsmPrinter::doInitialization(M); // Only emit BTF when debuginfo available. 
- if (MAI->doesSupportDebugInformation() && !empty(M.debug_compile_units())) { + if (MAI->doesSupportDebugInformation() && !M.debug_compile_units().empty()) { BTF = new BTFDebug(this); Handlers.push_back(HandlerInfo(std::unique_ptr<BTFDebug>(BTF), "emit", "Debug Info Emission", "BTF", diff --git a/lib/Target/BPF/BPFCORE.h b/lib/Target/BPF/BPFCORE.h index e0950d95f8d7..ed4778353e52 100644 --- a/lib/Target/BPF/BPFCORE.h +++ b/lib/Target/BPF/BPFCORE.h @@ -13,10 +13,18 @@ namespace llvm { class BPFCoreSharedInfo { public: - /// The attribute attached to globals representing a member offset + enum OffsetRelocKind : uint32_t { + FIELD_BYTE_OFFSET = 0, + FIELD_BYTE_SIZE, + FIELD_EXISTENCE, + FIELD_SIGNEDNESS, + FIELD_LSHIFT_U64, + FIELD_RSHIFT_U64, + + MAX_FIELD_RELOC_KIND, + }; + /// The attribute attached to globals representing a field access static const std::string AmaAttr; - /// The section name to identify a patchable external global - static const std::string PatchableExtSecName; }; } // namespace llvm diff --git a/lib/Target/BPF/BPFFrameLowering.h b/lib/Target/BPF/BPFFrameLowering.h index 2dc6277d2244..a546351ec6cb 100644 --- a/lib/Target/BPF/BPFFrameLowering.h +++ b/lib/Target/BPF/BPFFrameLowering.h @@ -21,7 +21,7 @@ class BPFSubtarget; class BPFFrameLowering : public TargetFrameLowering { public: explicit BPFFrameLowering(const BPFSubtarget &sti) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, 0) {} + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(8), 0) {} void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; diff --git a/lib/Target/BPF/BPFISelDAGToDAG.cpp b/lib/Target/BPF/BPFISelDAGToDAG.cpp index 1bd705c55188..f2be0ff070d2 100644 --- a/lib/Target/BPF/BPFISelDAGToDAG.cpp +++ b/lib/Target/BPF/BPFISelDAGToDAG.cpp @@ -45,9 +45,7 @@ class BPFDAGToDAGISel : public SelectionDAGISel { public: explicit BPFDAGToDAGISel(BPFTargetMachine &TM) - : SelectionDAGISel(TM), Subtarget(nullptr) { - curr_func_ = nullptr; - } + : SelectionDAGISel(TM), Subtarget(nullptr) {} StringRef getPassName() const override { return "BPF DAG->DAG Pattern Instruction Selection"; @@ -92,14 +90,8 @@ private: val_vec_type &Vals, int Offset); bool getConstantFieldValue(const GlobalAddressSDNode *Node, uint64_t Offset, uint64_t Size, unsigned char *ByteSeq); - bool checkLoadDef(unsigned DefReg, unsigned match_load_op); - // Mapping from ConstantStruct global value to corresponding byte-list values std::map<const void *, val_vec_type> cs_vals_; - // Mapping from vreg to load memory opcode - std::map<unsigned, unsigned> load_to_vreg_; - // Current function - const Function *curr_func_; }; } // namespace @@ -325,32 +317,13 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node, } void BPFDAGToDAGISel::PreprocessISelDAG() { - // Iterate through all nodes, interested in the following cases: + // Iterate through all nodes, interested in the following case: // // . loads from ConstantStruct or ConstantArray of constructs // which can be turns into constant itself, with this we can // avoid reading from read-only section at runtime. // - // . reg truncating is often the result of 8/16/32bit->64bit or - // 8/16bit->32bit conversion. If the reg value is loaded with - // masked byte width, the AND operation can be removed since - // BPF LOAD already has zero extension. - // - // This also solved a correctness issue. 
- // In BPF socket-related program, e.g., __sk_buff->{data, data_end} - // are 32-bit registers, but later on, kernel verifier will rewrite - // it with 64-bit value. Therefore, truncating the value after the - // load will result in incorrect code. - - // clear the load_to_vreg_ map so that we have a clean start - // for this function. - if (!curr_func_) { - curr_func_ = FuncInfo->Fn; - } else if (curr_func_ != FuncInfo->Fn) { - load_to_vreg_.clear(); - curr_func_ = FuncInfo->Fn; - } - + // . Removing redundant AND for intrinsic narrow loads. for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E;) { @@ -358,8 +331,6 @@ void BPFDAGToDAGISel::PreprocessISelDAG() { unsigned Opcode = Node->getOpcode(); if (Opcode == ISD::LOAD) PreprocessLoad(Node, I); - else if (Opcode == ISD::CopyToReg) - PreprocessCopyToReg(Node); else if (Opcode == ISD::AND) PreprocessTrunc(Node, I); } @@ -491,37 +462,6 @@ bool BPFDAGToDAGISel::fillConstantStruct(const DataLayout &DL, return true; } -void BPFDAGToDAGISel::PreprocessCopyToReg(SDNode *Node) { - const RegisterSDNode *RegN = dyn_cast<RegisterSDNode>(Node->getOperand(1)); - if (!RegN || !TargetRegisterInfo::isVirtualRegister(RegN->getReg())) - return; - - const LoadSDNode *LD = dyn_cast<LoadSDNode>(Node->getOperand(2)); - if (!LD) - return; - - // Assign a load value to a virtual register. record its load width - unsigned mem_load_op = 0; - switch (LD->getMemOperand()->getSize()) { - default: - return; - case 4: - mem_load_op = BPF::LDW; - break; - case 2: - mem_load_op = BPF::LDH; - break; - case 1: - mem_load_op = BPF::LDB; - break; - } - - LLVM_DEBUG(dbgs() << "Find Load Value to VReg " - << TargetRegisterInfo::virtReg2Index(RegN->getReg()) - << '\n'); - load_to_vreg_[RegN->getReg()] = mem_load_op; -} - void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node, SelectionDAG::allnodes_iterator &I) { ConstantSDNode *MaskN = dyn_cast<ConstantSDNode>(Node->getOperand(1)); @@ -535,112 +475,26 @@ void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node, // which the generic optimizer doesn't understand their results are // zero extended. SDValue BaseV = Node->getOperand(0); - if (BaseV.getOpcode() == ISD::INTRINSIC_W_CHAIN) { - unsigned IntNo = cast<ConstantSDNode>(BaseV->getOperand(1))->getZExtValue(); - uint64_t MaskV = MaskN->getZExtValue(); - - if (!((IntNo == Intrinsic::bpf_load_byte && MaskV == 0xFF) || - (IntNo == Intrinsic::bpf_load_half && MaskV == 0xFFFF) || - (IntNo == Intrinsic::bpf_load_word && MaskV == 0xFFFFFFFF))) - return; - - LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: "; - Node->dump(); dbgs() << '\n'); - - I--; - CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV); - I++; - CurDAG->DeleteNode(Node); - + if (BaseV.getOpcode() != ISD::INTRINSIC_W_CHAIN) return; - } - // Multiple basic blocks case. 
- if (BaseV.getOpcode() != ISD::CopyFromReg) - return; + unsigned IntNo = cast<ConstantSDNode>(BaseV->getOperand(1))->getZExtValue(); + uint64_t MaskV = MaskN->getZExtValue(); - unsigned match_load_op = 0; - switch (MaskN->getZExtValue()) { - default: + if (!((IntNo == Intrinsic::bpf_load_byte && MaskV == 0xFF) || + (IntNo == Intrinsic::bpf_load_half && MaskV == 0xFFFF) || + (IntNo == Intrinsic::bpf_load_word && MaskV == 0xFFFFFFFF))) return; - case 0xFFFFFFFF: - match_load_op = BPF::LDW; - break; - case 0xFFFF: - match_load_op = BPF::LDH; - break; - case 0xFF: - match_load_op = BPF::LDB; - break; - } - const RegisterSDNode *RegN = - dyn_cast<RegisterSDNode>(BaseV.getNode()->getOperand(1)); - if (!RegN || !TargetRegisterInfo::isVirtualRegister(RegN->getReg())) - return; - unsigned AndOpReg = RegN->getReg(); - LLVM_DEBUG(dbgs() << "Examine " << printReg(AndOpReg) << '\n'); - - // Examine the PHI insns in the MachineBasicBlock to found out the - // definitions of this virtual register. At this stage (DAG2DAG - // transformation), only PHI machine insns are available in the machine basic - // block. - MachineBasicBlock *MBB = FuncInfo->MBB; - MachineInstr *MII = nullptr; - for (auto &MI : *MBB) { - for (unsigned i = 0; i < MI.getNumOperands(); ++i) { - const MachineOperand &MOP = MI.getOperand(i); - if (!MOP.isReg() || !MOP.isDef()) - continue; - unsigned Reg = MOP.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg) && Reg == AndOpReg) { - MII = &MI; - break; - } - } - } - - if (MII == nullptr) { - // No phi definition in this block. - if (!checkLoadDef(AndOpReg, match_load_op)) - return; - } else { - // The PHI node looks like: - // %2 = PHI %0, <%bb.1>, %1, <%bb.3> - // Trace each incoming definition, e.g., (%0, %bb.1) and (%1, %bb.3) - // The AND operation can be removed if both %0 in %bb.1 and %1 in - // %bb.3 are defined with a load matching the MaskN. - LLVM_DEBUG(dbgs() << "Check PHI Insn: "; MII->dump(); dbgs() << '\n'); - unsigned PrevReg = -1; - for (unsigned i = 0; i < MII->getNumOperands(); ++i) { - const MachineOperand &MOP = MII->getOperand(i); - if (MOP.isReg()) { - if (MOP.isDef()) - continue; - PrevReg = MOP.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(PrevReg)) - return; - if (!checkLoadDef(PrevReg, match_load_op)) - return; - } - } - } - - LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: "; Node->dump(); - dbgs() << '\n'); + LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: "; + Node->dump(); dbgs() << '\n'); I--; CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV); I++; CurDAG->DeleteNode(Node); -} - -bool BPFDAGToDAGISel::checkLoadDef(unsigned DefReg, unsigned match_load_op) { - auto it = load_to_vreg_.find(DefReg); - if (it == load_to_vreg_.end()) - return false; // The definition of register is not exported yet. 
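The single-block case that survives this cleanup removes an AND whose mask exactly matches the width of a bpf_load_byte/half/word intrinsic. The mask is dead simply because those loads already zero-extend; modeled in plain C++ (a hypothetical helper, not BPF intrinsic syntax):

#include <cstdint>

uint64_t masked_byte_load(const uint8_t *p) {
  uint64_t v = p[0];   // stands in for llvm.bpf.load.byte: zero-extends
  return v & 0xFF;     // the AND the pass deletes; result equals 'v'
}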
- return it->second == match_load_op; + return; } FunctionPass *llvm::createBPFISelDag(BPFTargetMachine &TM) { diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp index ff69941d26fb..56e0288f26c9 100644 --- a/lib/Target/BPF/BPFISelLowering.cpp +++ b/lib/Target/BPF/BPFISelLowering.cpp @@ -132,9 +132,9 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM, setBooleanContents(ZeroOrOneBooleanContent); - // Function alignments (log2) - setMinFunctionAlignment(3); - setPrefFunctionAlignment(3); + // Function alignments + setMinFunctionAlignment(Align(8)); + setPrefFunctionAlignment(Align(8)); if (BPFExpandMemcpyInOrder) { // LLVM generic code will try to expand memcpy into load/store pairs at this @@ -236,9 +236,8 @@ SDValue BPFTargetLowering::LowerFormalArguments( } case MVT::i32: case MVT::i64: - unsigned VReg = RegInfo.createVirtualRegister(SimpleTy == MVT::i64 ? - &BPF::GPRRegClass : - &BPF::GPR32RegClass); + Register VReg = RegInfo.createVirtualRegister( + SimpleTy == MVT::i64 ? &BPF::GPRRegClass : &BPF::GPR32RegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, RegVT); @@ -571,9 +570,9 @@ BPFTargetLowering::EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB, DebugLoc DL = MI.getDebugLoc(); MachineRegisterInfo &RegInfo = F->getRegInfo(); - unsigned PromotedReg0 = RegInfo.createVirtualRegister(RC); - unsigned PromotedReg1 = RegInfo.createVirtualRegister(RC); - unsigned PromotedReg2 = RegInfo.createVirtualRegister(RC); + Register PromotedReg0 = RegInfo.createVirtualRegister(RC); + Register PromotedReg1 = RegInfo.createVirtualRegister(RC); + Register PromotedReg2 = RegInfo.createVirtualRegister(RC); BuildMI(BB, DL, TII.get(BPF::MOV_32_64), PromotedReg0).addReg(Reg); BuildMI(BB, DL, TII.get(BPF::SLL_ri), PromotedReg1) .addReg(PromotedReg0).addImm(32); @@ -699,7 +698,7 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, report_fatal_error("unimplemented select CondCode " + Twine(CC)); } - unsigned LHS = MI.getOperand(1).getReg(); + Register LHS = MI.getOperand(1).getReg(); bool isSignedCmp = (CC == ISD::SETGT || CC == ISD::SETGE || CC == ISD::SETLT || @@ -716,7 +715,7 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, LHS = EmitSubregExt(MI, BB, LHS, isSignedCmp); if (isSelectRROp) { - unsigned RHS = MI.getOperand(2).getReg(); + Register RHS = MI.getOperand(2).getReg(); if (is32BitCmp && !HasJmp32) RHS = EmitSubregExt(MI, BB, RHS, isSignedCmp); diff --git a/lib/Target/BPF/BPFInstrInfo.cpp b/lib/Target/BPF/BPFInstrInfo.cpp index 932f718d5490..6de3a4084d3d 100644 --- a/lib/Target/BPF/BPFInstrInfo.cpp +++ b/lib/Target/BPF/BPFInstrInfo.cpp @@ -43,11 +43,11 @@ void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } void BPFInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const { - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); + Register DstReg = MI->getOperand(0).getReg(); + Register SrcReg = MI->getOperand(1).getReg(); uint64_t CopyLen = MI->getOperand(2).getImm(); uint64_t Alignment = MI->getOperand(3).getImm(); - unsigned ScratchReg = MI->getOperand(4).getReg(); + Register ScratchReg = MI->getOperand(4).getReg(); MachineBasicBlock *BB = MI->getParent(); DebugLoc dl = MI->getDebugLoc(); unsigned LdOpc, StOpc; diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td index c44702a78ec8..ae5a82a99303 100644 --- a/lib/Target/BPF/BPFInstrInfo.td +++ b/lib/Target/BPF/BPFInstrInfo.td @@ -473,7 +473,7 
@@ class CALL<string OpcodeStr> class CALLX<string OpcodeStr> : TYPE_ALU_JMP<BPF_CALL.Value, BPF_X.Value, (outs), - (ins calltarget:$BrDst), + (ins GPR:$BrDst), !strconcat(OpcodeStr, " $BrDst"), []> { bits<32> BrDst; diff --git a/lib/Target/BPF/BPFMIChecking.cpp b/lib/Target/BPF/BPFMIChecking.cpp index 4c46289656b4..f82f166eda4d 100644 --- a/lib/Target/BPF/BPFMIChecking.cpp +++ b/lib/Target/BPF/BPFMIChecking.cpp @@ -19,6 +19,7 @@ #include "BPFTargetMachine.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" using namespace llvm; diff --git a/lib/Target/BPF/BPFMIPeephole.cpp b/lib/Target/BPF/BPFMIPeephole.cpp index 156ba793e359..e9eecc55c3c3 100644 --- a/lib/Target/BPF/BPFMIPeephole.cpp +++ b/lib/Target/BPF/BPFMIPeephole.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" using namespace llvm; @@ -71,7 +72,7 @@ void BPFMIPeephole::initialize(MachineFunction &MFParm) { MF = &MFParm; MRI = &MF->getRegInfo(); TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo(); - LLVM_DEBUG(dbgs() << "*** BPF MachineSSA peephole pass ***\n\n"); + LLVM_DEBUG(dbgs() << "*** BPF MachineSSA ZEXT Elim peephole pass ***\n\n"); } bool BPFMIPeephole::isMovFrom32Def(MachineInstr *MovMI) @@ -104,10 +105,10 @@ bool BPFMIPeephole::isMovFrom32Def(MachineInstr *MovMI) if (!opnd.isReg()) return false; - unsigned Reg = opnd.getReg(); - if ((TargetRegisterInfo::isVirtualRegister(Reg) && + Register Reg = opnd.getReg(); + if ((Register::isVirtualRegister(Reg) && MRI->getRegClass(Reg) == &BPF::GPRRegClass)) - return false; + return false; } LLVM_DEBUG(dbgs() << " One ZExt elim sequence identified.\n"); @@ -134,8 +135,8 @@ bool BPFMIPeephole::eliminateZExtSeq(void) { // SRL_ri rB, rB, 32 if (MI.getOpcode() == BPF::SRL_ri && MI.getOperand(2).getImm() == 32) { - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned ShfReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register ShfReg = MI.getOperand(1).getReg(); MachineInstr *SllMI = MRI->getVRegDef(ShfReg); LLVM_DEBUG(dbgs() << "Starting SRL found:"); @@ -159,7 +160,7 @@ bool BPFMIPeephole::eliminateZExtSeq(void) { LLVM_DEBUG(dbgs() << " Type cast Mov found:"); LLVM_DEBUG(MovMI->dump()); - unsigned SubReg = MovMI->getOperand(1).getReg(); + Register SubReg = MovMI->getOperand(1).getReg(); if (!isMovFrom32Def(MovMI)) { LLVM_DEBUG(dbgs() << " One ZExt elim sequence failed qualifying elim.\n"); @@ -186,7 +187,8 @@ bool BPFMIPeephole::eliminateZExtSeq(void) { } // end default namespace INITIALIZE_PASS(BPFMIPeephole, DEBUG_TYPE, - "BPF MachineSSA Peephole Optimization", false, false) + "BPF MachineSSA Peephole Optimization For ZEXT Eliminate", + false, false) char BPFMIPeephole::ID = 0; FunctionPass* llvm::createBPFMIPeepholePass() { return new BPFMIPeephole(); } @@ -253,12 +255,16 @@ bool BPFMIPreEmitPeephole::eliminateRedundantMov(void) { // enabled. The special type cast insn MOV_32_64 involves different // register class on src (i32) and dst (i64), RA could generate useless // instruction due to this. 
- if (MI.getOpcode() == BPF::MOV_32_64) { - unsigned dst = MI.getOperand(0).getReg(); - unsigned dst_sub = TRI->getSubReg(dst, BPF::sub_32); - unsigned src = MI.getOperand(1).getReg(); + unsigned Opcode = MI.getOpcode(); + if (Opcode == BPF::MOV_32_64 || + Opcode == BPF::MOV_rr || Opcode == BPF::MOV_rr_32) { + Register dst = MI.getOperand(0).getReg(); + Register src = MI.getOperand(1).getReg(); + + if (Opcode == BPF::MOV_32_64) + dst = TRI->getSubReg(dst, BPF::sub_32); - if (dst_sub != src) + if (dst != src) continue; ToErase = &MI; @@ -281,3 +287,177 @@ FunctionPass* llvm::createBPFMIPreEmitPeepholePass() { return new BPFMIPreEmitPeephole(); } + +STATISTIC(TruncElemNum, "Number of truncation eliminated"); + +namespace { + +struct BPFMIPeepholeTruncElim : public MachineFunctionPass { + + static char ID; + const BPFInstrInfo *TII; + MachineFunction *MF; + MachineRegisterInfo *MRI; + + BPFMIPeepholeTruncElim() : MachineFunctionPass(ID) { + initializeBPFMIPeepholeTruncElimPass(*PassRegistry::getPassRegistry()); + } + +private: + // Initialize class variables. + void initialize(MachineFunction &MFParm); + + bool eliminateTruncSeq(void); + +public: + + // Main entry point for this pass. + bool runOnMachineFunction(MachineFunction &MF) override { + if (skipFunction(MF.getFunction())) + return false; + + initialize(MF); + + return eliminateTruncSeq(); + } +}; + +static bool TruncSizeCompatible(int TruncSize, unsigned opcode) +{ + if (TruncSize == 1) + return opcode == BPF::LDB || opcode == BPF::LDB32; + + if (TruncSize == 2) + return opcode == BPF::LDH || opcode == BPF::LDH32; + + if (TruncSize == 4) + return opcode == BPF::LDW || opcode == BPF::LDW32; + + return false; +} + +// Initialize class variables. +void BPFMIPeepholeTruncElim::initialize(MachineFunction &MFParm) { + MF = &MFParm; + MRI = &MF->getRegInfo(); + TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo(); + LLVM_DEBUG(dbgs() << "*** BPF MachineSSA TRUNC Elim peephole pass ***\n\n"); +} + +// Reg truncating is often the result of 8/16/32bit->64bit or +// 8/16bit->32bit conversion. If the reg value is loaded with +// masked byte width, the AND operation can be removed since +// BPF LOAD already has zero extension. +// +// This also solved a correctness issue. +// In BPF socket-related program, e.g., __sk_buff->{data, data_end} +// are 32-bit registers, but later on, kernel verifier will rewrite +// it with 64-bit value. Therefore, truncating the value after the +// load will result in incorrect code. +bool BPFMIPeepholeTruncElim::eliminateTruncSeq(void) { + MachineInstr* ToErase = nullptr; + bool Eliminated = false; + + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { + // The second insn to remove if the eliminate candidate is a pair. + MachineInstr *MI2 = nullptr; + Register DstReg, SrcReg; + MachineInstr *DefMI; + int TruncSize = -1; + + // If the previous instruction was marked for elimination, remove it now. + if (ToErase) { + ToErase->eraseFromParent(); + ToErase = nullptr; + } + + // AND A, 0xFFFFFFFF will be turned into SLL/SRL pair due to immediate + // for BPF ANDI is i32, and this case only happens on ALU64. + if (MI.getOpcode() == BPF::SRL_ri && + MI.getOperand(2).getImm() == 32) { + SrcReg = MI.getOperand(1).getReg(); + MI2 = MRI->getVRegDef(SrcReg); + DstReg = MI.getOperand(0).getReg(); + + if (!MI2 || + MI2->getOpcode() != BPF::SLL_ri || + MI2->getOperand(2).getImm() != 32) + continue; + + // Update SrcReg. 
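The SLL_ri/SRL_ri pair matched above is how "AND with 0xFFFFFFFF" is expressed on ALU64, since the BPF AND immediate is only 32 bits wide. What the pair computes, in plain C++ terms (a sketch of the pattern, not the pass itself):

#include <cstdint>

// (v << 32) >> 32 on an unsigned 64-bit value is v & 0xffffffff; when
// 'v' already came from a zero-extending 32-bit load it is a no-op,
// which is why the pass can rewrite the pair to a plain register move.
uint64_t shift_pair_trunc(uint64_t v) {
  return (v << 32) >> 32;
}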
+ SrcReg = MI2->getOperand(1).getReg(); + DefMI = MRI->getVRegDef(SrcReg); + if (DefMI) + TruncSize = 4; + } else if (MI.getOpcode() == BPF::AND_ri || + MI.getOpcode() == BPF::AND_ri_32) { + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + DefMI = MRI->getVRegDef(SrcReg); + + if (!DefMI) + continue; + + int64_t imm = MI.getOperand(2).getImm(); + if (imm == 0xff) + TruncSize = 1; + else if (imm == 0xffff) + TruncSize = 2; + } + + if (TruncSize == -1) + continue; + + // The definition is PHI node, check all inputs. + if (DefMI->isPHI()) { + bool CheckFail = false; + + for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) { + MachineOperand &opnd = DefMI->getOperand(i); + if (!opnd.isReg()) { + CheckFail = true; + break; + } + + MachineInstr *PhiDef = MRI->getVRegDef(opnd.getReg()); + if (!PhiDef || PhiDef->isPHI() || + !TruncSizeCompatible(TruncSize, PhiDef->getOpcode())) { + CheckFail = true; + break; + } + } + + if (CheckFail) + continue; + } else if (!TruncSizeCompatible(TruncSize, DefMI->getOpcode())) { + continue; + } + + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(BPF::MOV_rr), DstReg) + .addReg(SrcReg); + + if (MI2) + MI2->eraseFromParent(); + + // Mark it to ToErase, and erase in the next iteration. + ToErase = &MI; + TruncElemNum++; + Eliminated = true; + } + } + + return Eliminated; +} + +} // end default namespace + +INITIALIZE_PASS(BPFMIPeepholeTruncElim, "bpf-mi-trunc-elim", + "BPF MachineSSA Peephole Optimization For TRUNC Eliminate", + false, false) + +char BPFMIPeepholeTruncElim::ID = 0; +FunctionPass* llvm::createBPFMIPeepholeTruncElimPass() +{ + return new BPFMIPeepholeTruncElim(); +} diff --git a/lib/Target/BPF/BPFMISimplifyPatchable.cpp b/lib/Target/BPF/BPFMISimplifyPatchable.cpp index e9114d7187e3..9c689aed6417 100644 --- a/lib/Target/BPF/BPFMISimplifyPatchable.cpp +++ b/lib/Target/BPF/BPFMISimplifyPatchable.cpp @@ -11,19 +11,15 @@ // ldd r2, r1, 0 // add r3, struct_base_reg, r2 // -// Here @global should either present a AMA (abstruct member access) or -// a patchable extern variable. And these two kinds of accesses -// are subject to bpf load time patching. After this pass, the +// Here @global should represent an AMA (abstruct member access). +// Such an access is subject to bpf load time patching. 
+// After this pass, the
 // code becomes
 // ld_imm64 r1, @global
 // add r3, struct_base_reg, r1
 //
 // Eventually, at BTF output stage, a relocation record will be generated
 // for ld_imm64 which should be replaced later by bpf loader:
-// r1 = <calculated offset> or <to_be_patched_extern_val>
-// add r3, struct_base_reg, r1
-// or
-// ld_imm64 r1, <to_be_patched_extern_val>
+// r1 = <calculated field_info>
 // add r3, struct_base_reg, r1
 //
 //===----------------------------------------------------------------------===//
@@ -34,6 +30,7 @@
 #include "BPFTargetMachine.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
 
 using namespace llvm;
 
@@ -100,9 +97,8 @@ bool BPFMISimplifyPatchable::removeLD() {
       if (!MI.getOperand(2).isImm() || MI.getOperand(2).getImm())
         continue;
 
-      unsigned DstReg = MI.getOperand(0).getReg();
-      unsigned SrcReg = MI.getOperand(1).getReg();
-      int64_t ImmVal = MI.getOperand(2).getImm();
+      Register DstReg = MI.getOperand(0).getReg();
+      Register SrcReg = MI.getOperand(1).getReg();
 
       MachineInstr *DefInst = MRI->getUniqueVRegDef(SrcReg);
       if (!DefInst)
@@ -118,17 +114,8 @@
           // Global variables representing structure offset or
           // patchable extern globals.
           if (GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) {
-            assert(ImmVal == 0);
+            assert(MI.getOperand(2).getImm() == 0);
             IsCandidate = true;
-          } else if (!GVar->hasInitializer() && GVar->hasExternalLinkage() &&
-                     GVar->getSection() ==
-                         BPFCoreSharedInfo::PatchableExtSecName) {
-            if (ImmVal == 0)
-              IsCandidate = true;
-            else
-              errs() << "WARNING: unhandled patchable extern "
-                     << GVar->getName() << " with load offset " << ImmVal
-                     << "\n";
           }
         }
       }
diff --git a/lib/Target/BPF/BPFRegisterInfo.cpp b/lib/Target/BPF/BPFRegisterInfo.cpp
index 714af06e11d9..8de81a469b84 100644
--- a/lib/Target/BPF/BPFRegisterInfo.cpp
+++ b/lib/Target/BPF/BPFRegisterInfo.cpp
@@ -77,7 +77,7 @@ void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
     assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
   }
 
-  unsigned FrameReg = getFrameRegister(MF);
+  Register FrameReg = getFrameRegister(MF);
   int FrameIndex = MI.getOperand(i).getIndex();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 
@@ -86,7 +86,7 @@
       WarnSize(Offset, MF, DL);
     MI.getOperand(i).ChangeToRegister(FrameReg, false);
 
-    unsigned reg = MI.getOperand(i - 1).getReg();
+    Register reg = MI.getOperand(i - 1).getReg();
     BuildMI(MBB, ++II, DL, TII.get(BPF::ADD_ri), reg)
         .addReg(reg)
         .addImm(Offset);
@@ -105,7 +105,7 @@
     // architecture does not really support FI_ri, replace it with
     //   MOV_rr <target_reg>, frame_reg
     //   ADD_ri <target_reg>, imm
-    unsigned reg = MI.getOperand(i - 1).getReg();
+    Register reg = MI.getOperand(i - 1).getReg();
 
     BuildMI(MBB, ++II, DL, TII.get(BPF::MOV_rr), reg)
         .addReg(FrameReg);
diff --git a/lib/Target/BPF/BPFTargetMachine.cpp b/lib/Target/BPF/BPFTargetMachine.cpp
index 24c0ff0f7f15..0c4f2c74e7a4 100644
--- a/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/lib/Target/BPF/BPFTargetMachine.cpp
@@ -36,6 +36,7 @@ extern "C" void LLVMInitializeBPFTarget() {
   PassRegistry &PR = *PassRegistry::getPassRegistry();
   initializeBPFAbstractMemberAccessPass(PR);
   initializeBPFMIPeepholePass(PR);
+  initializeBPFMIPeepholeTruncElimPass(PR);
 }
 
 // DataLayout: little or big endian
@@ -61,7 +62,7 @@ BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT,
     : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
                         getEffectiveRelocModel(RM),
                         getEffectiveCodeModel(CM, CodeModel::Small), OL),
-      TLOF(make_unique<TargetLoweringObjectFileELF>()),
+      TLOF(std::make_unique<TargetLoweringObjectFileELF>()),
       Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
 
@@ -94,7 +95,7 @@ TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) {
 
 void BPFPassConfig::addIRPasses() {
-  addPass(createBPFAbstractMemberAccess());
+  addPass(createBPFAbstractMemberAccess(&getBPFTargetMachine()));
 
   TargetPassConfig::addIRPasses();
 }
 
@@ -115,15 +116,16 @@ void BPFPassConfig::addMachineSSAOptimization() {
   TargetPassConfig::addMachineSSAOptimization();
 
   const BPFSubtarget *Subtarget = getBPFTargetMachine().getSubtargetImpl();
-  if (Subtarget->getHasAlu32() && !DisableMIPeephole)
-    addPass(createBPFMIPeepholePass());
+  if (!DisableMIPeephole) {
+    if (Subtarget->getHasAlu32())
+      addPass(createBPFMIPeepholePass());
+    addPass(createBPFMIPeepholeTruncElimPass());
+  }
 }
 
 void BPFPassConfig::addPreEmitPass() {
-  const BPFSubtarget *Subtarget = getBPFTargetMachine().getSubtargetImpl();
-
   addPass(createBPFMIPreEmitCheckingPass());
   if (getOptLevel() != CodeGenOpt::None)
-    if (Subtarget->getHasAlu32() && !DisableMIPeephole)
+    if (!DisableMIPeephole)
       addPass(createBPFMIPreEmitPeepholePass());
 }
diff --git a/lib/Target/BPF/BTF.h b/lib/Target/BPF/BTF.h
index ad56716710a6..a13c862bf840 100644
--- a/lib/Target/BPF/BTF.h
+++ b/lib/Target/BPF/BTF.h
@@ -17,7 +17,7 @@
 ///
 /// The binary layout for .BTF.ext section:
 ///   struct ExtHeader
-///   FuncInfo, LineInfo, OffsetReloc and ExternReloc subsections
+///   FuncInfo, LineInfo, FieldReloc and ExternReloc subsections
 /// The FuncInfo subsection is defined as below:
 ///   BTFFuncInfo Size
 ///   struct SecFuncInfo for ELF section #1
@@ -32,19 +32,12 @@
 ///   struct SecLineInfo for ELF section #2
 ///   A number of struct BPFLineInfo for ELF section #2
 ///   ...
-/// The OffsetReloc subsection is defined as below:
-///   BPFOffsetReloc Size
-///   struct SecOffsetReloc for ELF section #1
-///   A number of struct BPFOffsetReloc for ELF section #1
-///   struct SecOffsetReloc for ELF section #2
-///   A number of struct BPFOffsetReloc for ELF section #2
-///   ...
-/// The ExternReloc subsection is defined as below:
-///   BPFExternReloc Size
-///   struct SecExternReloc for ELF section #1
-///   A number of struct BPFExternReloc for ELF section #1
-///   struct SecExternReloc for ELF section #2
-///   A number of struct BPFExternReloc for ELF section #2
+/// The FieldReloc subsection is defined as below:
+///   BPFFieldReloc Size
+///   struct SecFieldReloc for ELF section #1
+///   A number of struct BPFFieldReloc for ELF section #1
+///   struct SecFieldReloc for ELF section #2
+///   A number of struct BPFFieldReloc for ELF section #2
 ///   ...
 ///
 /// The section formats are also defined at
@@ -63,7 +56,7 @@ enum : uint32_t { MAGIC = 0xeB9F, VERSION = 1 };
 /// Sizes in bytes of various things in the BTF format.
 enum {
   HeaderSize = 24,
-  ExtHeaderSize = 40,
+  ExtHeaderSize = 32,
   CommonTypeSize = 12,
   BTFArraySize = 12,
   BTFEnumSize = 8,
@@ -72,12 +65,10 @@
   BTFDataSecVarSize = 12,
   SecFuncInfoSize = 8,
   SecLineInfoSize = 8,
-  SecOffsetRelocSize = 8,
-  SecExternRelocSize = 8,
+  SecFieldRelocSize = 8,
   BPFFuncInfoSize = 8,
   BPFLineInfoSize = 16,
-  BPFOffsetRelocSize = 12,
-  BPFExternRelocSize = 8,
+  BPFFieldRelocSize = 16,
 };
 
 /// The .BTF section header definition.
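With the extern-reloc pair gone, the .BTF.ext header shrinks from 40 to 32 bytes: an 8-byte preamble plus six u32 offset/length fields. A mirror of the resulting layout as a compile-time check (the preamble fields and FuncInfoOff are outside this excerpt and assumed from the header's documented format):

#include <cstdint>

struct ExtHeaderMirror {
  uint16_t Magic;       // 0xeB9F
  uint8_t  Version;     // 1
  uint8_t  Flags;
  uint32_t HdrLen;      // == 32 after this change
  uint32_t FuncInfoOff, FuncInfoLen;
  uint32_t LineInfoOff, LineInfoLen;
  uint32_t FieldRelocOff, FieldRelocLen;
};
static_assert(sizeof(ExtHeaderMirror) == 32, "matches ExtHeaderSize");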
@@ -213,10 +204,8 @@ struct ExtHeader { uint32_t FuncInfoLen; ///< Length of func info section uint32_t LineInfoOff; ///< Offset of line info section uint32_t LineInfoLen; ///< Length of line info section - uint32_t OffsetRelocOff; ///< Offset of offset reloc section - uint32_t OffsetRelocLen; ///< Length of offset reloc section - uint32_t ExternRelocOff; ///< Offset of extern reloc section - uint32_t ExternRelocLen; ///< Length of extern reloc section + uint32_t FieldRelocOff; ///< Offset of offset reloc section + uint32_t FieldRelocLen; ///< Length of offset reloc section }; /// Specifying one function info. @@ -247,28 +236,17 @@ struct SecLineInfo { }; /// Specifying one offset relocation. -struct BPFOffsetReloc { +struct BPFFieldReloc { uint32_t InsnOffset; ///< Byte offset in this section uint32_t TypeID; ///< TypeID for the relocation uint32_t OffsetNameOff; ///< The string to traverse types + uint32_t RelocKind; ///< What to patch the instruction }; /// Specifying offset relocation's in one section. -struct SecOffsetReloc { - uint32_t SecNameOff; ///< Section name index in the .BTF string table - uint32_t NumOffsetReloc; ///< Number of offset reloc's in this section -}; - -/// Specifying one offset relocation. -struct BPFExternReloc { - uint32_t InsnOffset; ///< Byte offset in this section - uint32_t ExternNameOff; ///< The string for external variable -}; - -/// Specifying extern relocation's in one section. -struct SecExternReloc { +struct SecFieldReloc { uint32_t SecNameOff; ///< Section name index in the .BTF string table - uint32_t NumExternReloc; ///< Number of extern reloc's in this section + uint32_t NumFieldReloc; ///< Number of offset reloc's in this section }; } // End namespace BTF. diff --git a/lib/Target/BPF/BTFDebug.cpp b/lib/Target/BPF/BTFDebug.cpp index fa35c6619e21..db551e739bd7 100644 --- a/lib/Target/BPF/BTFDebug.cpp +++ b/lib/Target/BPF/BTFDebug.cpp @@ -184,9 +184,7 @@ void BTFTypeEnum::emitType(MCStreamer &OS) { } } -BTFTypeArray::BTFTypeArray(uint32_t ElemTypeId, uint32_t ElemSize, - uint32_t NumElems) - : ElemSize(ElemSize) { +BTFTypeArray::BTFTypeArray(uint32_t ElemTypeId, uint32_t NumElems) { Kind = BTF::BTF_KIND_ARRAY; BTFType.NameOff = 0; BTFType.Info = Kind << 24; @@ -216,12 +214,6 @@ void BTFTypeArray::emitType(MCStreamer &OS) { OS.EmitIntValue(ArrayInfo.Nelems, 4); } -void BTFTypeArray::getLocInfo(uint32_t Loc, uint32_t &LocOffset, - uint32_t &ElementTypeId) { - ElementTypeId = ArrayInfo.ElemType; - LocOffset = Loc * ElemSize; -} - /// Represent either a struct or a union. BTFTypeStruct::BTFTypeStruct(const DICompositeType *STy, bool IsStruct, bool HasBitField, uint32_t Vlen) @@ -251,7 +243,8 @@ void BTFTypeStruct::completeType(BTFDebug &BDebug) { } else { BTFMember.Offset = DDTy->getOffsetInBits(); } - BTFMember.Type = BDebug.getTypeId(DDTy->getBaseType()); + const auto *BaseTy = DDTy->getBaseType(); + BTFMember.Type = BDebug.getTypeId(BaseTy); Members.push_back(BTFMember); } } @@ -268,15 +261,6 @@ void BTFTypeStruct::emitType(MCStreamer &OS) { std::string BTFTypeStruct::getName() { return STy->getName(); } -void BTFTypeStruct::getMemberInfo(uint32_t Loc, uint32_t &MemberOffset, - uint32_t &MemberType) { - MemberType = Members[Loc].Type; - MemberOffset = - HasBitField ? Members[Loc].Offset & 0xffffff : Members[Loc].Offset; -} - -uint32_t BTFTypeStruct::getStructSize() { return STy->getSizeInBits() >> 3; } - /// The Func kind represents both subprogram and pointee of function /// pointers. 
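The widened relocation record itself is four u32s, matching the BPFFieldRelocSize of 16 declared earlier. A mirror with a compile-time size check (a sketch only; the authoritative definition is the BPFFieldReloc in the hunk above):

#include <cstdint>

struct FieldRelocMirror {
  uint32_t InsnOffset;    // byte offset of the insn in its section
  uint32_t TypeID;        // BTF type id the access starts from
  uint32_t OffsetNameOff; // string-table offset of the index string
  uint32_t RelocKind;     // one of the OffsetRelocKind values
};
static_assert(sizeof(FieldRelocMirror) == 16, "matches BPFFieldRelocSize");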
If the FuncName is empty, it represents a pointee of function /// pointer. Otherwise, it represents a subprogram. The func arg names @@ -428,7 +412,7 @@ void BTFDebug::visitBasicType(const DIBasicType *BTy, uint32_t &TypeId) { // Create a BTF type instance for this DIBasicType and put it into // DIToIdMap for cross-type reference check. - auto TypeEntry = llvm::make_unique<BTFTypeInt>( + auto TypeEntry = std::make_unique<BTFTypeInt>( Encoding, BTy->getSizeInBits(), BTy->getOffsetInBits(), BTy->getName()); TypeId = addType(std::move(TypeEntry), BTy); } @@ -447,7 +431,7 @@ void BTFDebug::visitSubroutineType( // a function pointer has an empty name. The subprogram type will // not be added to DIToIdMap as it should not be referenced by // any other types. - auto TypeEntry = llvm::make_unique<BTFTypeFuncProto>(STy, VLen, FuncArgNames); + auto TypeEntry = std::make_unique<BTFTypeFuncProto>(STy, VLen, FuncArgNames); if (ForSubprog) TypeId = addType(std::move(TypeEntry)); // For subprogram else @@ -478,7 +462,7 @@ void BTFDebug::visitStructType(const DICompositeType *CTy, bool IsStruct, } auto TypeEntry = - llvm::make_unique<BTFTypeStruct>(CTy, IsStruct, HasBitField, VLen); + std::make_unique<BTFTypeStruct>(CTy, IsStruct, HasBitField, VLen); StructTypes.push_back(TypeEntry.get()); TypeId = addType(std::move(TypeEntry), CTy); @@ -489,35 +473,29 @@ void BTFDebug::visitStructType(const DICompositeType *CTy, bool IsStruct, void BTFDebug::visitArrayType(const DICompositeType *CTy, uint32_t &TypeId) { // Visit array element type. - uint32_t ElemTypeId, ElemSize; + uint32_t ElemTypeId; const DIType *ElemType = CTy->getBaseType(); visitTypeEntry(ElemType, ElemTypeId, false, false); - ElemSize = ElemType->getSizeInBits() >> 3; - if (!CTy->getSizeInBits()) { - auto TypeEntry = llvm::make_unique<BTFTypeArray>(ElemTypeId, 0, 0); - ArrayTypes.push_back(TypeEntry.get()); - ElemTypeId = addType(std::move(TypeEntry), CTy); - } else { - // Visit array dimensions. - DINodeArray Elements = CTy->getElements(); - for (int I = Elements.size() - 1; I >= 0; --I) { - if (auto *Element = dyn_cast_or_null<DINode>(Elements[I])) - if (Element->getTag() == dwarf::DW_TAG_subrange_type) { - const DISubrange *SR = cast<DISubrange>(Element); - auto *CI = SR->getCount().dyn_cast<ConstantInt *>(); - int64_t Count = CI->getSExtValue(); + // Visit array dimensions. + DINodeArray Elements = CTy->getElements(); + for (int I = Elements.size() - 1; I >= 0; --I) { + if (auto *Element = dyn_cast_or_null<DINode>(Elements[I])) + if (Element->getTag() == dwarf::DW_TAG_subrange_type) { + const DISubrange *SR = cast<DISubrange>(Element); + auto *CI = SR->getCount().dyn_cast<ConstantInt *>(); + int64_t Count = CI->getSExtValue(); - auto TypeEntry = - llvm::make_unique<BTFTypeArray>(ElemTypeId, ElemSize, Count); - ArrayTypes.push_back(TypeEntry.get()); - if (I == 0) - ElemTypeId = addType(std::move(TypeEntry), CTy); - else - ElemTypeId = addType(std::move(TypeEntry)); - ElemSize = ElemSize * Count; - } - } + // For struct s { int b; char c[]; }, the c[] will be represented + // as an array with Count = -1. + auto TypeEntry = + std::make_unique<BTFTypeArray>(ElemTypeId, + Count >= 0 ? Count : 0); + if (I == 0) + ElemTypeId = addType(std::move(TypeEntry), CTy); + else + ElemTypeId = addType(std::move(TypeEntry)); + } } // The array TypeId is the type id of the outermost dimension. @@ -526,7 +504,7 @@ void BTFDebug::visitArrayType(const DICompositeType *CTy, uint32_t &TypeId) { // The IR does not have a type for array index while BTF wants one. 
// So create an array index type if there is none. if (!ArrayIndexTypeId) { - auto TypeEntry = llvm::make_unique<BTFTypeInt>(dwarf::DW_ATE_unsigned, 32, + auto TypeEntry = std::make_unique<BTFTypeInt>(dwarf::DW_ATE_unsigned, 32, 0, "__ARRAY_SIZE_TYPE__"); ArrayIndexTypeId = addType(std::move(TypeEntry)); } @@ -538,7 +516,7 @@ void BTFDebug::visitEnumType(const DICompositeType *CTy, uint32_t &TypeId) { if (VLen > BTF::MAX_VLEN) return; - auto TypeEntry = llvm::make_unique<BTFTypeEnum>(CTy, VLen); + auto TypeEntry = std::make_unique<BTFTypeEnum>(CTy, VLen); TypeId = addType(std::move(TypeEntry), CTy); // No need to visit base type as BTF does not encode it. } @@ -546,7 +524,7 @@ void BTFDebug::visitEnumType(const DICompositeType *CTy, uint32_t &TypeId) { /// Handle structure/union forward declarations. void BTFDebug::visitFwdDeclType(const DICompositeType *CTy, bool IsUnion, uint32_t &TypeId) { - auto TypeEntry = llvm::make_unique<BTFTypeFwd>(CTy->getName(), IsUnion); + auto TypeEntry = std::make_unique<BTFTypeFwd>(CTy->getName(), IsUnion); TypeId = addType(std::move(TypeEntry), CTy); } @@ -588,7 +566,7 @@ void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId, /// Find a candidate, generate a fixup. Later on the struct/union /// pointee type will be replaced with either a real type or /// a forward declaration. - auto TypeEntry = llvm::make_unique<BTFTypeDerived>(DTy, Tag, true); + auto TypeEntry = std::make_unique<BTFTypeDerived>(DTy, Tag, true); auto &Fixup = FixupDerivedTypes[CTy->getName()]; Fixup.first = CTag == dwarf::DW_TAG_union_type; Fixup.second.push_back(TypeEntry.get()); @@ -602,7 +580,7 @@ void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId, if (Tag == dwarf::DW_TAG_pointer_type || Tag == dwarf::DW_TAG_typedef || Tag == dwarf::DW_TAG_const_type || Tag == dwarf::DW_TAG_volatile_type || Tag == dwarf::DW_TAG_restrict_type) { - auto TypeEntry = llvm::make_unique<BTFTypeDerived>(DTy, Tag, false); + auto TypeEntry = std::make_unique<BTFTypeDerived>(DTy, Tag, false); TypeId = addType(std::move(TypeEntry), DTy); } else if (Tag != dwarf::DW_TAG_member) { return; @@ -669,7 +647,7 @@ void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) { } auto TypeEntry = - llvm::make_unique<BTFTypeStruct>(CTy, true, HasBitField, Elements.size()); + std::make_unique<BTFTypeStruct>(CTy, true, HasBitField, Elements.size()); StructTypes.push_back(TypeEntry.get()); TypeId = addType(std::move(TypeEntry), CTy); @@ -774,9 +752,10 @@ void BTFDebug::emitBTFSection() { } void BTFDebug::emitBTFExtSection() { - // Do not emit section if empty FuncInfoTable and LineInfoTable. + // Do not emit section if empty FuncInfoTable and LineInfoTable + // and FieldRelocTable. if (!FuncInfoTable.size() && !LineInfoTable.size() && - !OffsetRelocTable.size() && !ExternRelocTable.size()) + !FieldRelocTable.size()) return; MCContext &Ctx = OS.getContext(); @@ -788,8 +767,8 @@ void BTFDebug::emitBTFExtSection() { // Account for FuncInfo/LineInfo record size as well. uint32_t FuncLen = 4, LineLen = 4; - // Do not account for optional OffsetReloc/ExternReloc. - uint32_t OffsetRelocLen = 0, ExternRelocLen = 0; + // Do not account for optional FieldReloc. 
+  uint32_t FieldRelocLen = 0;
   for (const auto &FuncSec : FuncInfoTable) {
     FuncLen += BTF::SecFuncInfoSize;
     FuncLen += FuncSec.second.size() * BTF::BPFFuncInfoSize;
@@ -798,28 +777,20 @@
     LineLen += BTF::SecLineInfoSize;
     LineLen += LineSec.second.size() * BTF::BPFLineInfoSize;
   }
-  for (const auto &OffsetRelocSec : OffsetRelocTable) {
-    OffsetRelocLen += BTF::SecOffsetRelocSize;
-    OffsetRelocLen += OffsetRelocSec.second.size() * BTF::BPFOffsetRelocSize;
-  }
-  for (const auto &ExternRelocSec : ExternRelocTable) {
-    ExternRelocLen += BTF::SecExternRelocSize;
-    ExternRelocLen += ExternRelocSec.second.size() * BTF::BPFExternRelocSize;
+  for (const auto &FieldRelocSec : FieldRelocTable) {
+    FieldRelocLen += BTF::SecFieldRelocSize;
+    FieldRelocLen += FieldRelocSec.second.size() * BTF::BPFFieldRelocSize;
   }
-  if (OffsetRelocLen)
-    OffsetRelocLen += 4;
-  if (ExternRelocLen)
-    ExternRelocLen += 4;
+  if (FieldRelocLen)
+    FieldRelocLen += 4;
 
   OS.EmitIntValue(0, 4);
   OS.EmitIntValue(FuncLen, 4);
   OS.EmitIntValue(FuncLen, 4);
   OS.EmitIntValue(LineLen, 4);
   OS.EmitIntValue(FuncLen + LineLen, 4);
-  OS.EmitIntValue(OffsetRelocLen, 4);
-  OS.EmitIntValue(FuncLen + LineLen + OffsetRelocLen, 4);
-  OS.EmitIntValue(ExternRelocLen, 4);
+  OS.EmitIntValue(FieldRelocLen, 4);
 
   // Emit func_info table.
   OS.AddComment("FuncInfo");
@@ -853,35 +824,20 @@
     }
   }
 
-  // Emit offset reloc table.
-  if (OffsetRelocLen) {
-    OS.AddComment("OffsetReloc");
-    OS.EmitIntValue(BTF::BPFOffsetRelocSize, 4);
-    for (const auto &OffsetRelocSec : OffsetRelocTable) {
-      OS.AddComment("Offset reloc section string offset=" +
-                    std::to_string(OffsetRelocSec.first));
-      OS.EmitIntValue(OffsetRelocSec.first, 4);
-      OS.EmitIntValue(OffsetRelocSec.second.size(), 4);
-      for (const auto &OffsetRelocInfo : OffsetRelocSec.second) {
-        Asm->EmitLabelReference(OffsetRelocInfo.Label, 4);
-        OS.EmitIntValue(OffsetRelocInfo.TypeID, 4);
-        OS.EmitIntValue(OffsetRelocInfo.OffsetNameOff, 4);
-      }
-    }
-  }
-
-  // Emit extern reloc table.
-  if (ExternRelocLen) {
-    OS.AddComment("ExternReloc");
-    OS.EmitIntValue(BTF::BPFExternRelocSize, 4);
-    for (const auto &ExternRelocSec : ExternRelocTable) {
-      OS.AddComment("Extern reloc section string offset=" +
-                    std::to_string(ExternRelocSec.first));
-      OS.EmitIntValue(ExternRelocSec.first, 4);
-      OS.EmitIntValue(ExternRelocSec.second.size(), 4);
-      for (const auto &ExternRelocInfo : ExternRelocSec.second) {
-        Asm->EmitLabelReference(ExternRelocInfo.Label, 4);
-        OS.EmitIntValue(ExternRelocInfo.ExternNameOff, 4);
+  // Emit field reloc table.
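In isolation, the subsection length computed above is: nothing when there are no relocations, otherwise 4 bytes for the leading record-size word plus, per ELF section, an 8-byte SecFieldReloc header and 16 bytes per record. As a checkable sketch:

#include <cstdint>
#include <vector>

uint32_t field_reloc_subsection_len(const std::vector<uint32_t> &recs_per_sec) {
  if (recs_per_sec.empty())
    return 0;                  // whole subsection omitted when empty
  uint32_t len = 4;            // record-size word (BPFFieldRelocSize)
  for (uint32_t n : recs_per_sec)
    len += 8 + 16u * n;        // SecFieldRelocSize + n * BPFFieldRelocSize
  return len;
}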
+ if (FieldRelocLen) { + OS.AddComment("FieldReloc"); + OS.EmitIntValue(BTF::BPFFieldRelocSize, 4); + for (const auto &FieldRelocSec : FieldRelocTable) { + OS.AddComment("Field reloc section string offset=" + + std::to_string(FieldRelocSec.first)); + OS.EmitIntValue(FieldRelocSec.first, 4); + OS.EmitIntValue(FieldRelocSec.second.size(), 4); + for (const auto &FieldRelocInfo : FieldRelocSec.second) { + Asm->EmitLabelReference(FieldRelocInfo.Label, 4); + OS.EmitIntValue(FieldRelocInfo.TypeID, 4); + OS.EmitIntValue(FieldRelocInfo.OffsetNameOff, 4); + OS.EmitIntValue(FieldRelocInfo.RelocKind, 4); } } } @@ -942,7 +898,7 @@ void BTFDebug::beginFunctionImpl(const MachineFunction *MF) { // Construct subprogram func type auto FuncTypeEntry = - llvm::make_unique<BTFTypeFunc>(SP->getName(), ProtoTypeId); + std::make_unique<BTFTypeFunc>(SP->getName(), ProtoTypeId); uint32_t FuncTypeId = addType(std::move(FuncTypeEntry)); for (const auto &TypeEntry : TypeEntries) @@ -980,71 +936,27 @@ unsigned BTFDebug::populateStructType(const DIType *Ty) { return Id; } -// Find struct/array debuginfo types given a type id. -void BTFDebug::setTypeFromId(uint32_t TypeId, BTFTypeStruct **PrevStructType, - BTFTypeArray **PrevArrayType) { - for (const auto &StructType : StructTypes) { - if (StructType->getId() == TypeId) { - *PrevStructType = StructType; - return; - } - } - for (const auto &ArrayType : ArrayTypes) { - if (ArrayType->getId() == TypeId) { - *PrevArrayType = ArrayType; - return; - } - } -} - -/// Generate a struct member offset relocation. -void BTFDebug::generateOffsetReloc(const MachineInstr *MI, +/// Generate a struct member field relocation. +void BTFDebug::generateFieldReloc(const MachineInstr *MI, const MCSymbol *ORSym, DIType *RootTy, StringRef AccessPattern) { - BTFTypeStruct *PrevStructType = nullptr; - BTFTypeArray *PrevArrayType = nullptr; unsigned RootId = populateStructType(RootTy); - setTypeFromId(RootId, &PrevStructType, &PrevArrayType); - unsigned RootTySize = PrevStructType->getStructSize(); - - BTFOffsetReloc OffsetReloc; - OffsetReloc.Label = ORSym; - OffsetReloc.OffsetNameOff = addString(AccessPattern.drop_back()); - OffsetReloc.TypeID = RootId; + size_t FirstDollar = AccessPattern.find_first_of('$'); + size_t FirstColon = AccessPattern.find_first_of(':'); + size_t SecondColon = AccessPattern.find_first_of(':', FirstColon + 1); + StringRef IndexPattern = AccessPattern.substr(FirstDollar + 1); + StringRef RelocKindStr = AccessPattern.substr(FirstColon + 1, + SecondColon - FirstColon); + StringRef PatchImmStr = AccessPattern.substr(SecondColon + 1, + FirstDollar - SecondColon); - uint32_t Start = 0, End = 0, Offset = 0; - bool FirstAccess = true; - for (auto C : AccessPattern) { - if (C != ':') { - End++; - } else { - std::string SubStr = AccessPattern.substr(Start, End - Start); - int Loc = std::stoi(SubStr); - - if (FirstAccess) { - Offset = Loc * RootTySize; - FirstAccess = false; - } else if (PrevStructType) { - uint32_t MemberOffset, MemberTypeId; - PrevStructType->getMemberInfo(Loc, MemberOffset, MemberTypeId); - - Offset += MemberOffset >> 3; - PrevStructType = nullptr; - setTypeFromId(MemberTypeId, &PrevStructType, &PrevArrayType); - } else if (PrevArrayType) { - uint32_t LocOffset, ElementTypeId; - PrevArrayType->getLocInfo(Loc, LocOffset, ElementTypeId); - - Offset += LocOffset; - PrevArrayType = nullptr; - setTypeFromId(ElementTypeId, &PrevStructType, &PrevArrayType); - } - Start = End + 1; - End = Start; - } - } - AccessOffsets[RootTy->getName().str() + ":" + 
AccessPattern.str()] = Offset; - OffsetRelocTable[SecNameOff].push_back(OffsetReloc); + BTFFieldReloc FieldReloc; + FieldReloc.Label = ORSym; + FieldReloc.OffsetNameOff = addString(IndexPattern); + FieldReloc.TypeID = RootId; + FieldReloc.RelocKind = std::stoull(RelocKindStr); + PatchImms[AccessPattern.str()] = std::stoul(PatchImmStr); + FieldRelocTable[SecNameOff].push_back(FieldReloc); } void BTFDebug::processLDimm64(const MachineInstr *MI) { @@ -1052,7 +964,7 @@ void BTFDebug::processLDimm64(const MachineInstr *MI) { // will generate an .BTF.ext record. // // If the insn is "r2 = LD_imm64 @__BTF_...", - // add this insn into the .BTF.ext OffsetReloc subsection. + // add this insn into the .BTF.ext FieldReloc subsection. // Relocation looks like: // . SecName: // . InstOffset @@ -1083,16 +995,7 @@ void BTFDebug::processLDimm64(const MachineInstr *MI) { MDNode *MDN = GVar->getMetadata(LLVMContext::MD_preserve_access_index); DIType *Ty = dyn_cast<DIType>(MDN); - generateOffsetReloc(MI, ORSym, Ty, GVar->getName()); - } else if (GVar && !GVar->hasInitializer() && GVar->hasExternalLinkage() && - GVar->getSection() == BPFCoreSharedInfo::PatchableExtSecName) { - MCSymbol *ORSym = OS.getContext().createTempSymbol(); - OS.EmitLabel(ORSym); - - BTFExternReloc ExternReloc; - ExternReloc.Label = ORSym; - ExternReloc.ExternNameOff = addString(GVar->getName()); - ExternRelocTable[SecNameOff].push_back(ExternReloc); + generateFieldReloc(MI, ORSym, Ty, GVar->getName()); } } } @@ -1200,12 +1103,12 @@ void BTFDebug::processGlobals(bool ProcessingMapDef) { ? BTF::VAR_GLOBAL_ALLOCATED : BTF::VAR_STATIC; auto VarEntry = - llvm::make_unique<BTFKindVar>(Global.getName(), GVTypeId, GVarInfo); + std::make_unique<BTFKindVar>(Global.getName(), GVTypeId, GVarInfo); uint32_t VarId = addType(std::move(VarEntry)); // Find or create a DataSec if (DataSecEntries.find(SecName) == DataSecEntries.end()) { - DataSecEntries[SecName] = llvm::make_unique<BTFKindDataSec>(Asm, SecName); + DataSecEntries[SecName] = std::make_unique<BTFKindDataSec>(Asm, SecName); } // Calculate symbol size @@ -1224,30 +1127,12 @@ bool BTFDebug::InstLower(const MachineInstr *MI, MCInst &OutMI) { const GlobalValue *GVal = MO.getGlobal(); auto *GVar = dyn_cast<GlobalVariable>(GVal); if (GVar && GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) { - MDNode *MDN = GVar->getMetadata(LLVMContext::MD_preserve_access_index); - DIType *Ty = dyn_cast<DIType>(MDN); - std::string TypeName = Ty->getName(); - int64_t Imm = AccessOffsets[TypeName + ":" + GVar->getName().str()]; - - // Emit "mov ri, <imm>" for abstract member accesses. + // Emit "mov ri, <imm>" for patched immediate. + uint32_t Imm = PatchImms[GVar->getName().str()]; OutMI.setOpcode(BPF::MOV_ri); OutMI.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); OutMI.addOperand(MCOperand::createImm(Imm)); return true; - } else if (GVar && !GVar->hasInitializer() && - GVar->hasExternalLinkage() && - GVar->getSection() == BPFCoreSharedInfo::PatchableExtSecName) { - const IntegerType *IntTy = dyn_cast<IntegerType>(GVar->getValueType()); - assert(IntTy); - // For patchable externals, emit "LD_imm64, ri, 0" if the external - // variable is 64bit width, emit "mov ri, 0" otherwise. 
- if (IntTy->getBitWidth() == 64) - OutMI.setOpcode(BPF::LD_imm64); - else - OutMI.setOpcode(BPF::MOV_ri); - OutMI.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); - OutMI.addOperand(MCOperand::createImm(0)); - return true; } } } @@ -1281,7 +1166,7 @@ void BTFDebug::endModule() { } if (StructTypeId == 0) { - auto FwdTypeEntry = llvm::make_unique<BTFTypeFwd>(TypeName, IsUnion); + auto FwdTypeEntry = std::make_unique<BTFTypeFwd>(TypeName, IsUnion); StructTypeId = addType(std::move(FwdTypeEntry)); } diff --git a/lib/Target/BPF/BTFDebug.h b/lib/Target/BPF/BTFDebug.h index 6c0cdde17d9b..c01e0d1d1612 100644 --- a/lib/Target/BPF/BTFDebug.h +++ b/lib/Target/BPF/BTFDebug.h @@ -104,15 +104,13 @@ public: /// Handle array type. class BTFTypeArray : public BTFTypeBase { - uint32_t ElemSize; struct BTF::BTFArray ArrayInfo; public: - BTFTypeArray(uint32_t ElemTypeId, uint32_t ElemSize, uint32_t NumElems); + BTFTypeArray(uint32_t ElemTypeId, uint32_t NumElems); uint32_t getSize() { return BTFTypeBase::getSize() + BTF::BTFArraySize; } void completeType(BTFDebug &BDebug); void emitType(MCStreamer &OS); - void getLocInfo(uint32_t Loc, uint32_t &LocOffset, uint32_t &ElementTypeId); }; /// Handle struct/union type. @@ -130,8 +128,6 @@ public: void completeType(BTFDebug &BDebug); void emitType(MCStreamer &OS); std::string getName(); - void getMemberInfo(uint32_t Loc, uint32_t &Offset, uint32_t &MemberType); - uint32_t getStructSize(); }; /// Handle function pointer. @@ -199,7 +195,7 @@ class BTFStringTable { /// A mapping from string table offset to the index /// of the Table. It is used to avoid putting /// duplicated strings in the table. - std::unordered_map<uint32_t, uint32_t> OffsetToIdMap; + std::map<uint32_t, uint32_t> OffsetToIdMap; /// A vector of strings to represent the string table. std::vector<std::string> Table; @@ -228,16 +224,11 @@ struct BTFLineInfo { }; /// Represent one offset relocation. -struct BTFOffsetReloc { +struct BTFFieldReloc { const MCSymbol *Label; ///< MCSymbol identifying insn for the reloc uint32_t TypeID; ///< Type ID uint32_t OffsetNameOff; ///< The string to traverse types -}; - -/// Represent one extern relocation. -struct BTFExternReloc { - const MCSymbol *Label; ///< MCSymbol identifying insn for the reloc - uint32_t ExternNameOff; ///< The extern variable name + uint32_t RelocKind; ///< What to patch the instruction }; /// Collect and emit BTF information. @@ -253,13 +244,11 @@ class BTFDebug : public DebugHandlerBase { std::unordered_map<const DIType *, uint32_t> DIToIdMap; std::map<uint32_t, std::vector<BTFFuncInfo>> FuncInfoTable; std::map<uint32_t, std::vector<BTFLineInfo>> LineInfoTable; - std::map<uint32_t, std::vector<BTFOffsetReloc>> OffsetRelocTable; - std::map<uint32_t, std::vector<BTFExternReloc>> ExternRelocTable; + std::map<uint32_t, std::vector<BTFFieldReloc>> FieldRelocTable; StringMap<std::vector<std::string>> FileContent; std::map<std::string, std::unique_ptr<BTFKindDataSec>> DataSecEntries; std::vector<BTFTypeStruct *> StructTypes; - std::vector<BTFTypeArray *> ArrayTypes; - std::map<std::string, int64_t> AccessOffsets; + std::map<std::string, uint32_t> PatchImms; std::map<StringRef, std::pair<bool, std::vector<BTFTypeDerived *>>> FixupDerivedTypes; @@ -305,13 +294,9 @@ class BTFDebug : public DebugHandlerBase { void processGlobals(bool ProcessingMapDef); /// Generate one offset relocation record. 
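generateFieldReloc, shown a few hunks earlier, undoes the key construction from BPFAbstractMemberAccess: it carves the reloc kind and patched immediate back out of the global's name and keeps only the "$"-suffix as the string to emit. A stand-alone sketch of that split (the pass's own substr arithmetic may carry a trailing delimiter along, which std::stoul tolerates since it stops at the first non-digit):

#include <cstdint>
#include <string>

void split_access_key(const std::string &key, uint32_t &kind,
                      uint32_t &imm, std::string &indices) {
  size_t c1 = key.find(':');
  size_t c2 = key.find(':', c1 + 1);
  size_t dollar = key.find('$');
  kind = std::stoul(key.substr(c1 + 1, c2 - c1 - 1));
  imm = std::stoul(key.substr(c2 + 1, dollar - c2 - 1));
  indices = key.substr(dollar + 1);   // becomes OffsetNameOff's string
}
// "sk_buff:0:50$0:0:0:2:0" yields kind=0, imm=50, indices="0:0:0:2:0"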
- void generateOffsetReloc(const MachineInstr *MI, const MCSymbol *ORSym, + void generateFieldReloc(const MachineInstr *MI, const MCSymbol *ORSym, DIType *RootTy, StringRef AccessPattern); - /// Set the to-be-traversed Struct/Array Type based on TypeId. - void setTypeFromId(uint32_t TypeId, BTFTypeStruct **PrevStructType, - BTFTypeArray **PrevArrayType); - /// Populating unprocessed struct type. unsigned populateStructType(const DIType *Ty); diff --git a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp index 057bbf5c3b06..ef4e324c3bdd 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp @@ -39,7 +39,7 @@ unsigned BPFELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { // determine the type of the relocation - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getKind()) { default: llvm_unreachable("invalid fixup kind!"); case FK_SecRel_8: @@ -85,5 +85,5 @@ unsigned BPFELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, std::unique_ptr<MCObjectTargetWriter> llvm::createBPFELFObjectWriter(uint8_t OSABI) { - return llvm::make_unique<BPFELFObjectWriter>(OSABI); + return std::make_unique<BPFELFObjectWriter>(OSABI); } diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index 0881bf841f90..590c4a2eb69d 100644 --- a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -702,7 +702,7 @@ bool HexagonAsmParser::ParseDirectiveFalign(unsigned Size, SMLoc L) { // Make sure we have a number (false is returned if expression is a number) if (!getParser().parseExpression(Value)) { // Make sure this is a number that is in range - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value); + auto *MCE = cast<MCConstantExpr>(Value); uint64_t IntValue = MCE->getValue(); if (!isUIntN(Size, IntValue) && !isIntN(Size, IntValue)) return Error(ExprLoc, "literal value out of range (256) for falign"); diff --git a/lib/Target/Hexagon/BitTracker.cpp b/lib/Target/Hexagon/BitTracker.cpp index b7e95caf24fb..efd5ed915127 100644 --- a/lib/Target/Hexagon/BitTracker.cpp +++ b/lib/Target/Hexagon/BitTracker.cpp @@ -84,7 +84,7 @@ namespace { raw_ostream &operator<< (raw_ostream &OS, const printv &PV) { if (PV.R) - OS << 'v' << TargetRegisterInfo::virtReg2Index(PV.R); + OS << 'v' << Register::virtReg2Index(PV.R); else OS << 's'; return OS; @@ -201,7 +201,7 @@ BitTracker::~BitTracker() { bool BT::RegisterCell::meet(const RegisterCell &RC, unsigned SelfR) { // An example when "meet" can be invoked with SelfR == 0 is a phi node // with a physical register as an operand. - assert(SelfR == 0 || TargetRegisterInfo::isVirtualRegister(SelfR)); + assert(SelfR == 0 || Register::isVirtualRegister(SelfR)); bool Changed = false; for (uint16_t i = 0, n = Bits.size(); i < n; ++i) { const BitValue &RCV = RC[i]; @@ -335,12 +335,13 @@ uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const { // 1. find a physical register PhysR from the same class as RR.Reg, // 2. find a physical register PhysS that corresponds to PhysR:RR.Sub, // 3. find a register class that contains PhysS. 
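[Aside: the HexagonAsmParser hunk above tightens dyn_cast<MCConstantExpr> to cast<MCConstantExpr>. In LLVM's casting utilities, dyn_cast returns null on a type mismatch while cast asserts the cast is valid, so the unchecked MCE->getValue() that follows is only sound with cast (or after a null check). A plain-C++ analogue of the two idioms, using dynamic_cast as a stand-in:]

#include <cassert>

struct MCExpr { virtual ~MCExpr() = default; };
struct MCConstantExpr final : MCExpr {
  long long V = 0;
  long long getValue() const { return V; }
};

// dyn_cast-style: tolerate non-constant expressions by branching on null.
long long valueOrZero(const MCExpr *E) {
  if (auto *CE = dynamic_cast<const MCConstantExpr *>(E))
    return CE->getValue();
  return 0;
}

// cast-style: the caller has already established that E is a constant,
// so a failed downcast is a programmer error, not a recoverable case.
long long knownValue(const MCExpr *E) {
  auto *CE = dynamic_cast<const MCConstantExpr *>(E);
  assert(CE && "expected a constant expression");
  return CE->getValue();
}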
- if (TargetRegisterInfo::isVirtualRegister(RR.Reg)) { + if (Register::isVirtualRegister(RR.Reg)) { const auto &VC = composeWithSubRegIndex(*MRI.getRegClass(RR.Reg), RR.Sub); return TRI.getRegSizeInBits(VC); } - assert(TargetRegisterInfo::isPhysicalRegister(RR.Reg)); - unsigned PhysR = (RR.Sub == 0) ? RR.Reg : TRI.getSubReg(RR.Reg, RR.Sub); + assert(Register::isPhysicalRegister(RR.Reg)); + Register PhysR = + (RR.Sub == 0) ? Register(RR.Reg) : TRI.getSubReg(RR.Reg, RR.Sub); return getPhysRegBitWidth(PhysR); } @@ -350,10 +351,10 @@ BT::RegisterCell BT::MachineEvaluator::getCell(const RegisterRef &RR, // Physical registers are assumed to be present in the map with an unknown // value. Don't actually insert anything in the map, just return the cell. - if (TargetRegisterInfo::isPhysicalRegister(RR.Reg)) + if (Register::isPhysicalRegister(RR.Reg)) return RegisterCell::self(0, BW); - assert(TargetRegisterInfo::isVirtualRegister(RR.Reg)); + assert(Register::isVirtualRegister(RR.Reg)); // For virtual registers that belong to a class that is not tracked, // generate an "unknown" value as well. const TargetRegisterClass *C = MRI.getRegClass(RR.Reg); @@ -376,7 +377,7 @@ void BT::MachineEvaluator::putCell(const RegisterRef &RR, RegisterCell RC, // While updating the cell map can be done in a meaningful way for // a part of a register, it makes little sense to implement it as the // SSA representation would never contain such "partial definitions". - if (!TargetRegisterInfo::isVirtualRegister(RR.Reg)) + if (!Register::isVirtualRegister(RR.Reg)) return; assert(RR.Sub == 0 && "Unexpected sub-register in definition"); // Eliminate all ref-to-reg-0 bit values: replace them with "self". @@ -711,7 +712,7 @@ BT::BitMask BT::MachineEvaluator::mask(unsigned Reg, unsigned Sub) const { } uint16_t BT::MachineEvaluator::getPhysRegBitWidth(unsigned Reg) const { - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(Register::isPhysicalRegister(Reg)); const TargetRegisterClass &PC = *TRI.getMinimalPhysRegClass(Reg); return TRI.getRegSizeInBits(PC); } @@ -874,7 +875,7 @@ void BT::visitNonBranch(const MachineInstr &MI) { continue; RegisterRef RD(MO); assert(RD.Sub == 0 && "Unexpected sub-register in definition"); - if (!TargetRegisterInfo::isVirtualRegister(RD.Reg)) + if (!Register::isVirtualRegister(RD.Reg)) continue; bool Changed = false; diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp index b07d15609ede..3d771d388e28 100644 --- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -130,7 +130,7 @@ bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!MO.isReg()) return true; - unsigned RegNumber = MO.getReg(); + Register RegNumber = MO.getReg(); // This should be an assert in the frontend. if (Hexagon::DoubleRegsRegClass.contains(RegNumber)) RegNumber = TRI->getSubReg(RegNumber, ExtraCode[0] == 'L' ? 
diff --git a/lib/Target/Hexagon/HexagonBitSimplify.cpp b/lib/Target/Hexagon/HexagonBitSimplify.cpp index 7b75d251ccd3..3068fb6f9629 100644 --- a/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -147,11 +147,11 @@ namespace { } static inline unsigned v2x(unsigned v) { - return TargetRegisterInfo::virtReg2Index(v); + return Register::virtReg2Index(v); } static inline unsigned x2v(unsigned x) { - return TargetRegisterInfo::index2VirtReg(x); + return Register::index2VirtReg(x); } }; @@ -290,8 +290,8 @@ void HexagonBitSimplify::getInstrDefs(const MachineInstr &MI, for (auto &Op : MI.operands()) { if (!Op.isReg() || !Op.isDef()) continue; - unsigned R = Op.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = Op.getReg(); + if (!Register::isVirtualRegister(R)) continue; Defs.insert(R); } @@ -302,8 +302,8 @@ void HexagonBitSimplify::getInstrUses(const MachineInstr &MI, for (auto &Op : MI.operands()) { if (!Op.isReg() || !Op.isUse()) continue; - unsigned R = Op.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = Op.getReg(); + if (!Register::isVirtualRegister(R)) continue; Uses.insert(R); } @@ -353,8 +353,7 @@ bool HexagonBitSimplify::getConst(const BitTracker::RegisterCell &RC, bool HexagonBitSimplify::replaceReg(unsigned OldR, unsigned NewR, MachineRegisterInfo &MRI) { - if (!TargetRegisterInfo::isVirtualRegister(OldR) || - !TargetRegisterInfo::isVirtualRegister(NewR)) + if (!Register::isVirtualRegister(OldR) || !Register::isVirtualRegister(NewR)) return false; auto Begin = MRI.use_begin(OldR), End = MRI.use_end(); decltype(End) NextI; @@ -367,8 +366,7 @@ bool HexagonBitSimplify::replaceReg(unsigned OldR, unsigned NewR, bool HexagonBitSimplify::replaceRegWithSub(unsigned OldR, unsigned NewR, unsigned NewSR, MachineRegisterInfo &MRI) { - if (!TargetRegisterInfo::isVirtualRegister(OldR) || - !TargetRegisterInfo::isVirtualRegister(NewR)) + if (!Register::isVirtualRegister(OldR) || !Register::isVirtualRegister(NewR)) return false; if (hasTiedUse(OldR, MRI, NewSR)) return false; @@ -384,8 +382,7 @@ bool HexagonBitSimplify::replaceRegWithSub(unsigned OldR, unsigned NewR, bool HexagonBitSimplify::replaceSubWithSub(unsigned OldR, unsigned OldSR, unsigned NewR, unsigned NewSR, MachineRegisterInfo &MRI) { - if (!TargetRegisterInfo::isVirtualRegister(OldR) || - !TargetRegisterInfo::isVirtualRegister(NewR)) + if (!Register::isVirtualRegister(OldR) || !Register::isVirtualRegister(NewR)) return false; if (OldSR != NewSR && hasTiedUse(OldR, MRI, NewSR)) return false; @@ -896,7 +893,7 @@ bool HexagonBitSimplify::getUsedBits(unsigned Opc, unsigned OpN, // register class. const TargetRegisterClass *HexagonBitSimplify::getFinalVRegClass( const BitTracker::RegisterRef &RR, MachineRegisterInfo &MRI) { - if (!TargetRegisterInfo::isVirtualRegister(RR.Reg)) + if (!Register::isVirtualRegister(RR.Reg)) return nullptr; auto *RC = MRI.getRegClass(RR.Reg); if (RR.Sub == 0) @@ -927,8 +924,8 @@ const TargetRegisterClass *HexagonBitSimplify::getFinalVRegClass( // with a 32-bit register. bool HexagonBitSimplify::isTransparentCopy(const BitTracker::RegisterRef &RD, const BitTracker::RegisterRef &RS, MachineRegisterInfo &MRI) { - if (!TargetRegisterInfo::isVirtualRegister(RD.Reg) || - !TargetRegisterInfo::isVirtualRegister(RS.Reg)) + if (!Register::isVirtualRegister(RD.Reg) || + !Register::isVirtualRegister(RS.Reg)) return false; // Return false if one (or both) classes are nullptr. 
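[Aside: the mechanical TargetRegisterInfo::isVirtualRegister → Register::isVirtualRegister (and unsigned → Register) rewrites that dominate these Hexagon hunks track the migration of the register-number helpers onto the llvm::Register wrapper. A simplified model of the underlying encoding, condensed from llvm/CodeGen/Register.h as of this import; treat the exact bit layout as an implementation detail:]

#include <cassert>

// Register numbers share one unsigned value space: 0 means "no register",
// small positives are physical registers, [1<<30, 1<<31) holds biased
// frame indexes, and values with the high bit set are virtual registers.
struct SimpleRegister {
  static bool isStackSlot(unsigned R) { return int(R) >= (1 << 30); }
  static bool isPhysicalRegister(unsigned R) {
    assert(!isStackSlot(R) && "check isStackSlot() first");
    return int(R) > 0;
  }
  static bool isVirtualRegister(unsigned R) {
    assert(!isStackSlot(R) && "check isStackSlot() first");
    return int(R) < 0; // high bit set
  }
  static unsigned virtReg2Index(unsigned R) {
    assert(isVirtualRegister(R) && "not a virtual register");
    return R & ~(1u << 31);
  }
  static unsigned index2VirtReg(unsigned Index) { return Index | (1u << 31); }
};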
auto *DRC = getFinalVRegClass(RD, MRI); @@ -979,7 +976,7 @@ bool DeadCodeElimination::isDead(unsigned R) const { continue; if (UseI->isPHI()) { assert(!UseI->getOperand(0).getSubReg()); - unsigned DR = UseI->getOperand(0).getReg(); + Register DR = UseI->getOperand(0).getReg(); if (DR == R) continue; } @@ -1018,8 +1015,8 @@ bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) { for (auto &Op : MI->operands()) { if (!Op.isReg() || !Op.isDef()) continue; - unsigned R = Op.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R) || !isDead(R)) { + Register R = Op.getReg(); + if (!Register::isVirtualRegister(R) || !isDead(R)) { AllDead = false; break; } @@ -1220,8 +1217,8 @@ bool RedundantInstrElimination::computeUsedBits(unsigned Reg, BitVector &Bits) { return false; MachineInstr &UseI = *I->getParent(); if (UseI.isPHI() || UseI.isCopy()) { - unsigned DefR = UseI.getOperand(0).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(DefR)) + Register DefR = UseI.getOperand(0).getReg(); + if (!Register::isVirtualRegister(DefR)) return false; Pending.push_back(DefR); } else { @@ -1345,7 +1342,7 @@ bool RedundantInstrElimination::processBlock(MachineBasicBlock &B, // If found, replace the instruction with a COPY. const DebugLoc &DL = MI->getDebugLoc(); const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI); - unsigned NewR = MRI.createVirtualRegister(FRC); + Register NewR = MRI.createVirtualRegister(FRC); MachineInstr *CopyI = BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR) .addReg(RS.Reg, 0, RS.Sub); @@ -1412,7 +1409,7 @@ bool ConstGeneration::isTfrConst(const MachineInstr &MI) { // register class and the actual value being transferred. unsigned ConstGeneration::genTfrConst(const TargetRegisterClass *RC, int64_t C, MachineBasicBlock &B, MachineBasicBlock::iterator At, DebugLoc &DL) { - unsigned Reg = MRI.createVirtualRegister(RC); + Register Reg = MRI.createVirtualRegister(RC); if (RC == &Hexagon::IntRegsRegClass) { BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrsi), Reg) .addImm(int32_t(C)); @@ -1470,7 +1467,7 @@ bool ConstGeneration::processBlock(MachineBasicBlock &B, const RegisterSet&) { if (Defs.count() != 1) continue; unsigned DR = Defs.find_first(); - if (!TargetRegisterInfo::isVirtualRegister(DR)) + if (!Register::isVirtualRegister(DR)) continue; uint64_t U; const BitTracker::RegisterCell &DRC = BT.lookup(DR); @@ -1609,7 +1606,7 @@ bool CopyGeneration::processBlock(MachineBasicBlock &B, auto *FRC = HBS::getFinalVRegClass(R, MRI); if (findMatch(R, MR, AVB)) { - unsigned NewR = MRI.createVirtualRegister(FRC); + Register NewR = MRI.createVirtualRegister(FRC); BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR) .addReg(MR.Reg, 0, MR.Sub); BT.put(BitTracker::RegisterRef(NewR), BT.get(MR)); @@ -1628,7 +1625,7 @@ bool CopyGeneration::processBlock(MachineBasicBlock &B, BitTracker::RegisterRef ML, MH; if (findMatch(TL, ML, AVB) && findMatch(TH, MH, AVB)) { auto *FRC = HBS::getFinalVRegClass(R, MRI); - unsigned NewR = MRI.createVirtualRegister(FRC); + Register NewR = MRI.createVirtualRegister(FRC); BuildMI(B, At, DL, HII.get(TargetOpcode::REG_SEQUENCE), NewR) .addReg(ML.Reg, 0, ML.Sub) .addImm(SubLo) @@ -1819,7 +1816,7 @@ bool BitSimplification::matchHalf(unsigned SelfR, if (Reg == 0 || Reg == SelfR) // Don't match "self". 
return false; - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return false; if (!BT.has(Reg)) return false; @@ -2025,7 +2022,7 @@ bool BitSimplification::genPackhl(MachineInstr *MI, return false; MachineBasicBlock &B = *MI->getParent(); - unsigned NewR = MRI.createVirtualRegister(&Hexagon::DoubleRegsRegClass); + Register NewR = MRI.createVirtualRegister(&Hexagon::DoubleRegsRegClass); DebugLoc DL = MI->getDebugLoc(); auto At = MI->isPHI() ? B.getFirstNonPHI() : MachineBasicBlock::iterator(MI); @@ -2097,7 +2094,7 @@ bool BitSimplification::genCombineHalf(MachineInstr *MI, MachineBasicBlock &B = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); - unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); auto At = MI->isPHI() ? B.getFirstNonPHI() : MachineBasicBlock::iterator(MI); BuildMI(B, At, DL, HII.get(COpc), NewR) @@ -2154,7 +2151,7 @@ bool BitSimplification::genExtractLow(MachineInstr *MI, if (!validateReg(RS, NewOpc, 1)) continue; - unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); auto At = MI->isPHI() ? B.getFirstNonPHI() : MachineBasicBlock::iterator(MI); auto MIB = BuildMI(B, At, DL, HII.get(NewOpc), NewR) @@ -2368,7 +2365,7 @@ bool BitSimplification::simplifyTstbit(MachineInstr *MI, return true; } } else if (V.is(0) || V.is(1)) { - unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass); + Register NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass); unsigned NewOpc = V.is(0) ? Hexagon::PS_false : Hexagon::PS_true; BuildMI(B, At, DL, HII.get(NewOpc), NewR); HBS::replaceReg(RD.Reg, NewR, MRI); @@ -2541,7 +2538,7 @@ bool BitSimplification::simplifyExtractLow(MachineInstr *MI, DebugLoc DL = MI->getDebugLoc(); MachineBasicBlock &B = *MI->getParent(); - unsigned NewR = MRI.createVirtualRegister(FRC); + Register NewR = MRI.createVirtualRegister(FRC); auto At = MI->isPHI() ? B.getFirstNonPHI() : MachineBasicBlock::iterator(MI); auto MIB = BuildMI(B, At, DL, HII.get(ExtOpc), NewR) @@ -2612,8 +2609,8 @@ bool BitSimplification::simplifyRCmp0(MachineInstr *MI, KnownNZ = true; } - auto ReplaceWithConst = [&] (int C) { - unsigned NewR = MRI.createVirtualRegister(FRC); + auto ReplaceWithConst = [&](int C) { + Register NewR = MRI.createVirtualRegister(FRC); BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrsi), NewR) .addImm(C); HBS::replaceReg(RD.Reg, NewR, MRI); @@ -2678,7 +2675,7 @@ bool BitSimplification::simplifyRCmp0(MachineInstr *MI, // replace the comparison with a C2_muxii, using the same predicate // register, but with operands substituted with 0/1 accordingly. 
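[Aside: most of the BitSimplification rewrites above share one shape: allocate a fresh virtual register in the final register class, build the replacement instruction at a PHI-safe insertion point, then redirect uses. A sketch of that shape under the assumption that the usual LLVM codegen headers and a TargetInstrInfo are available; it is not a function from this file:]

// Sketch only. Assumes llvm/CodeGen/MachineRegisterInfo.h and
// llvm/CodeGen/MachineInstrBuilder.h, with types from the llvm namespace.
Register rewriteAsCopy(MachineInstr *MI, MachineRegisterInfo &MRI,
                       const TargetInstrInfo &TII,
                       const TargetRegisterClass *FRC,
                       Register SrcR, unsigned SrcSub) {
  MachineBasicBlock &B = *MI->getParent();
  // PHIs must stay grouped at the block head, so insert after them.
  auto At = MI->isPHI() ? B.getFirstNonPHI()
                        : MachineBasicBlock::iterator(MI);
  Register NewR = MRI.createVirtualRegister(FRC);
  BuildMI(B, At, MI->getDebugLoc(), TII.get(TargetOpcode::COPY), NewR)
      .addReg(SrcR, 0, SrcSub);
  return NewR; // callers then redirect uses, e.g. HBS::replaceReg
}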
if ((KnownZ1 || KnownNZ1) && (KnownZ2 || KnownNZ2)) { - unsigned NewR = MRI.createVirtualRegister(FRC); + Register NewR = MRI.createVirtualRegister(FRC); BuildMI(B, At, DL, HII.get(Hexagon::C2_muxii), NewR) .addReg(InpDef->getOperand(1).getReg()) .addImm(KnownZ1 == (Opc == Hexagon::A4_rcmpeqi)) @@ -3071,7 +3068,7 @@ void HexagonLoopRescheduling::moveGroup(InstrGroup &G, MachineBasicBlock &LB, DenseMap<unsigned,unsigned> RegMap; const TargetRegisterClass *PhiRC = MRI->getRegClass(NewPredR); - unsigned PhiR = MRI->createVirtualRegister(PhiRC); + Register PhiR = MRI->createVirtualRegister(PhiRC); BuildMI(LB, At, At->getDebugLoc(), HII->get(TargetOpcode::PHI), PhiR) .addReg(NewPredR) .addMBB(&PB) @@ -3083,7 +3080,7 @@ void HexagonLoopRescheduling::moveGroup(InstrGroup &G, MachineBasicBlock &LB, const MachineInstr *SI = G.Ins[i-1]; unsigned DR = getDefReg(SI); const TargetRegisterClass *RC = MRI->getRegClass(DR); - unsigned NewDR = MRI->createVirtualRegister(RC); + Register NewDR = MRI->createVirtualRegister(RC); DebugLoc DL = SI->getDebugLoc(); auto MIB = BuildMI(LB, At, DL, HII->get(SI->getOpcode()), NewDR); @@ -3162,7 +3159,7 @@ bool HexagonLoopRescheduling::processLoop(LoopCand &C) { if (Defs.count() != 1) continue; unsigned DefR = Defs.find_first(); - if (!TargetRegisterInfo::isVirtualRegister(DefR)) + if (!Register::isVirtualRegister(DefR)) continue; if (!isBitShuffle(&*I, DefR)) continue; diff --git a/lib/Target/Hexagon/HexagonBitTracker.cpp b/lib/Target/Hexagon/HexagonBitTracker.cpp index ba50faac2cf9..ebd060ce503e 100644 --- a/lib/Target/Hexagon/HexagonBitTracker.cpp +++ b/lib/Target/Hexagon/HexagonBitTracker.cpp @@ -111,7 +111,7 @@ BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const { } uint16_t HexagonEvaluator::getPhysRegBitWidth(unsigned Reg) const { - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(Register::isPhysicalRegister(Reg)); using namespace Hexagon; const auto &HST = MF.getSubtarget<HexagonSubtarget>(); @@ -1042,8 +1042,8 @@ unsigned HexagonEvaluator::getUniqueDefVReg(const MachineInstr &MI) const { for (const MachineOperand &Op : MI.operands()) { if (!Op.isReg() || !Op.isDef()) continue; - unsigned R = Op.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = Op.getReg(); + if (!Register::isVirtualRegister(R)) continue; if (DefReg != 0) return 0; @@ -1220,7 +1220,7 @@ bool HexagonEvaluator::evaluateFormalCopy(const MachineInstr &MI, RegisterRef RD = MI.getOperand(0); RegisterRef RS = MI.getOperand(1); assert(RD.Sub == 0); - if (!TargetRegisterInfo::isPhysicalRegister(RS.Reg)) + if (!Register::isPhysicalRegister(RS.Reg)) return false; RegExtMap::const_iterator F = VRX.find(RD.Reg); if (F == VRX.end()) diff --git a/lib/Target/Hexagon/HexagonBlockRanges.cpp b/lib/Target/Hexagon/HexagonBlockRanges.cpp index 999150fc8c6e..d1d1b8ee7d41 100644 --- a/lib/Target/Hexagon/HexagonBlockRanges.cpp +++ b/lib/Target/Hexagon/HexagonBlockRanges.cpp @@ -268,14 +268,14 @@ HexagonBlockRanges::RegisterSet HexagonBlockRanges::expandToSubRegs( return SRs; } - if (TargetRegisterInfo::isPhysicalRegister(R.Reg)) { + if (Register::isPhysicalRegister(R.Reg)) { MCSubRegIterator I(R.Reg, &TRI); if (!I.isValid()) SRs.insert({R.Reg, 0}); for (; I.isValid(); ++I) SRs.insert({*I, 0}); } else { - assert(TargetRegisterInfo::isVirtualRegister(R.Reg)); + assert(Register::isVirtualRegister(R.Reg)); auto &RC = *MRI.getRegClass(R.Reg); unsigned PReg = *RC.begin(); MCSubRegIndexIterator I(PReg, &TRI); @@ -321,7 +321,7 @@ void 
HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap, if (!Op.isReg() || !Op.isUse() || Op.isUndef()) continue; RegisterRef R = { Op.getReg(), Op.getSubReg() }; - if (TargetRegisterInfo::isPhysicalRegister(R.Reg) && Reserved[R.Reg]) + if (Register::isPhysicalRegister(R.Reg) && Reserved[R.Reg]) continue; bool IsKill = Op.isKill(); for (auto S : expandToSubRegs(R, MRI, TRI)) { @@ -338,7 +338,7 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap, continue; RegisterRef R = { Op.getReg(), Op.getSubReg() }; for (auto S : expandToSubRegs(R, MRI, TRI)) { - if (TargetRegisterInfo::isPhysicalRegister(S.Reg) && Reserved[S.Reg]) + if (Register::isPhysicalRegister(S.Reg) && Reserved[S.Reg]) continue; if (Op.isDead()) Clobbers.insert(S); @@ -374,7 +374,7 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap, // Update maps for defs. for (RegisterRef S : Defs) { // Defs should already be expanded into subregs. - assert(!TargetRegisterInfo::isPhysicalRegister(S.Reg) || + assert(!Register::isPhysicalRegister(S.Reg) || !MCSubRegIterator(S.Reg, &TRI, false).isValid()); if (LastDef[S] != IndexType::None || LastUse[S] != IndexType::None) closeRange(S); @@ -383,7 +383,7 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap, // Update maps for clobbers. for (RegisterRef S : Clobbers) { // Clobbers should already be expanded into subregs. - assert(!TargetRegisterInfo::isPhysicalRegister(S.Reg) || + assert(!Register::isPhysicalRegister(S.Reg) || !MCSubRegIterator(S.Reg, &TRI, false).isValid()); if (LastDef[S] != IndexType::None || LastUse[S] != IndexType::None) closeRange(S); @@ -482,7 +482,7 @@ HexagonBlockRanges::RegToRangeMap HexagonBlockRanges::computeDeadMap( } } for (auto &P : LiveMap) - if (TargetRegisterInfo::isVirtualRegister(P.first.Reg)) + if (Register::isVirtualRegister(P.first.Reg)) addDeadRanges(P.first); LLVM_DEBUG(dbgs() << __func__ << ": dead map\n" diff --git a/lib/Target/Hexagon/HexagonBranchRelaxation.cpp b/lib/Target/Hexagon/HexagonBranchRelaxation.cpp index ee93739b2c7b..08f740806879 100644 --- a/lib/Target/Hexagon/HexagonBranchRelaxation.cpp +++ b/lib/Target/Hexagon/HexagonBranchRelaxation.cpp @@ -105,12 +105,11 @@ void HexagonBranchRelaxation::computeOffset(MachineFunction &MF, // offset of the current instruction from the start. unsigned InstOffset = 0; for (auto &B : MF) { - if (B.getAlignment()) { + if (B.getAlignment() != Align::None()) { // Although we don't know the exact layout of the final code, we need // to account for alignment padding somehow. This heuristic pads each // aligned basic block according to the alignment value. 
- int ByteAlign = (1u << B.getAlignment()) - 1; - InstOffset = (InstOffset + ByteAlign) & ~(ByteAlign); + InstOffset = alignTo(InstOffset, B.getAlignment()); } OffsetMap[&B] = InstOffset; for (auto &MI : B.instrs()) { diff --git a/lib/Target/Hexagon/HexagonConstExtenders.cpp b/lib/Target/Hexagon/HexagonConstExtenders.cpp index cfed0ecef272..ddc9b847ef1c 100644 --- a/lib/Target/Hexagon/HexagonConstExtenders.cpp +++ b/lib/Target/Hexagon/HexagonConstExtenders.cpp @@ -14,9 +14,10 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Register.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Pass.h" #include <map> #include <set> #include <utility> @@ -235,24 +236,24 @@ namespace { Reg = Op.getReg(); Sub = Op.getSubReg(); } else if (Op.isFI()) { - Reg = TargetRegisterInfo::index2StackSlot(Op.getIndex()); + Reg = llvm::Register::index2StackSlot(Op.getIndex()); } return *this; } bool isVReg() const { - return Reg != 0 && !TargetRegisterInfo::isStackSlot(Reg) && - TargetRegisterInfo::isVirtualRegister(Reg); + return Reg != 0 && !llvm::Register::isStackSlot(Reg) && + llvm::Register::isVirtualRegister(Reg); } bool isSlot() const { - return Reg != 0 && TargetRegisterInfo::isStackSlot(Reg); + return Reg != 0 && llvm::Register::isStackSlot(Reg); } operator MachineOperand() const { if (isVReg()) return MachineOperand::CreateReg(Reg, /*Def*/false, /*Imp*/false, /*Kill*/false, /*Dead*/false, /*Undef*/false, /*EarlyClobber*/false, Sub); - if (TargetRegisterInfo::isStackSlot(Reg)) { - int FI = TargetRegisterInfo::stackSlot2Index(Reg); + if (llvm::Register::isStackSlot(Reg)) { + int FI = llvm::Register::stackSlot2Index(Reg); return MachineOperand::CreateFI(FI); } llvm_unreachable("Cannot create MachineOperand"); @@ -1524,7 +1525,7 @@ void HCE::calculatePlacement(const ExtenderInit &ExtI, const IndexList &Refs, } HCE::Register HCE::insertInitializer(Loc DefL, const ExtenderInit &ExtI) { - unsigned DefR = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass); + llvm::Register DefR = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass); MachineBasicBlock &MBB = *DefL.Block; MachineBasicBlock::iterator At = DefL.At; DebugLoc dl = DefL.Block->findDebugLoc(DefL.At); diff --git a/lib/Target/Hexagon/HexagonConstPropagation.cpp b/lib/Target/Hexagon/HexagonConstPropagation.cpp index d1fde5da5fe8..a82501cabb9b 100644 --- a/lib/Target/Hexagon/HexagonConstPropagation.cpp +++ b/lib/Target/Hexagon/HexagonConstPropagation.cpp @@ -208,14 +208,14 @@ namespace { bool has(unsigned R) const { // All non-virtual registers are considered "bottom". - if (!TargetRegisterInfo::isVirtualRegister(R)) + if (!Register::isVirtualRegister(R)) return true; MapType::const_iterator F = Map.find(R); return F != Map.end(); } const LatticeCell &get(unsigned R) const { - if (!TargetRegisterInfo::isVirtualRegister(R)) + if (!Register::isVirtualRegister(R)) return Bottom; MapType::const_iterator F = Map.find(R); if (F != Map.end()) @@ -623,7 +623,7 @@ void MachineConstPropagator::visitPHI(const MachineInstr &PN) { const MachineOperand &MD = PN.getOperand(0); RegisterSubReg DefR(MD); - assert(TargetRegisterInfo::isVirtualRegister(DefR.Reg)); + assert(Register::isVirtualRegister(DefR.Reg)); bool Changed = false; @@ -652,7 +652,7 @@ Bottomize: RegisterSubReg UseR(SO); // If the input is not a virtual register, we don't really know what // value it holds. 
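[Aside: the HexagonBranchRelaxation change above swaps open-coded mask arithmetic over a log2 alignment for llvm::alignTo over the new Align type (llvm/Support/Alignment.h). For power-of-two alignments the two computations agree; a standalone check in plain C++:]

#include <cassert>
#include <cstdint>

// Old form: alignment stored as a log2 value, padding via mask arithmetic.
uint64_t alignViaMask(uint64_t Offset, unsigned Log2) {
  uint64_t ByteAlign = (1u << Log2) - 1;
  return (Offset + ByteAlign) & ~ByteAlign;
}

// What llvm::alignTo computes for an alignment value A.
uint64_t alignViaValue(uint64_t Offset, uint64_t A) {
  return (Offset + A - 1) / A * A;
}

int main() {
  for (unsigned Log2 = 0; Log2 <= 6; ++Log2)
    for (uint64_t Off = 0; Off < 256; ++Off)
      assert(alignViaMask(Off, Log2) == alignViaValue(Off, 1u << Log2));
}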
- if (!TargetRegisterInfo::isVirtualRegister(UseR.Reg)) + if (!Register::isVirtualRegister(UseR.Reg)) goto Bottomize; // If there is no cell for an input register, it means top. if (!Cells.has(UseR.Reg)) @@ -694,7 +694,7 @@ void MachineConstPropagator::visitNonBranch(const MachineInstr &MI) { continue; RegisterSubReg DefR(MO); // Only track virtual registers. - if (!TargetRegisterInfo::isVirtualRegister(DefR.Reg)) + if (!Register::isVirtualRegister(DefR.Reg)) continue; bool Changed = false; // If the evaluation failed, set cells for all output registers to bottom. @@ -1070,7 +1070,7 @@ bool MachineConstPropagator::run(MachineFunction &MF) { bool MachineConstEvaluator::getCell(const RegisterSubReg &R, const CellMap &Inputs, LatticeCell &RC) { - if (!TargetRegisterInfo::isVirtualRegister(R.Reg)) + if (!Register::isVirtualRegister(R.Reg)) return false; const LatticeCell &L = Inputs.get(R.Reg); if (!R.SubReg) { @@ -1926,7 +1926,7 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI, unsigned Opc = MI.getOpcode(); RegisterSubReg DefR(MD); assert(!DefR.SubReg); - if (!TargetRegisterInfo::isVirtualRegister(DefR.Reg)) + if (!Register::isVirtualRegister(DefR.Reg)) return false; if (MI.isCopy()) { @@ -2793,7 +2793,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI, if (!MO.isReg() || !MO.isUse() || MO.isImplicit()) continue; RegisterSubReg R(MO); - if (!TargetRegisterInfo::isVirtualRegister(R.Reg)) + if (!Register::isVirtualRegister(R.Reg)) continue; HasUse = true; // PHIs can legitimately have "top" cells after propagation. @@ -2813,7 +2813,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI, for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isUse() || MO.isImplicit()) continue; - unsigned R = MO.getReg(); + Register R = MO.getReg(); dbgs() << printReg(R, &TRI) << ": " << Inputs.get(R) << "\n"; } } @@ -2831,8 +2831,8 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI, for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isDef()) continue; - unsigned R = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = MO.getReg(); + if (!Register::isVirtualRegister(R)) continue; assert(!MO.getSubReg()); assert(Inputs.has(R)); @@ -2871,7 +2871,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI, const MCInstrDesc *NewD = (Ps & P::Zero) ? 
&HII.get(Hexagon::PS_false) : &HII.get(Hexagon::PS_true); - unsigned NewR = MRI->createVirtualRegister(PredRC); + Register NewR = MRI->createVirtualRegister(PredRC); const MachineInstrBuilder &MIB = BuildMI(B, At, DL, *NewD, NewR); (void)MIB; #ifndef NDEBUG @@ -2893,7 +2893,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI, NewRC = &Hexagon::IntRegsRegClass; else NewRC = &Hexagon::DoubleRegsRegClass; - unsigned NewR = MRI->createVirtualRegister(NewRC); + Register NewR = MRI->createVirtualRegister(NewRC); const MachineInstr *NewMI; if (W == 32) { @@ -3009,7 +3009,7 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI, if (V < 0) V = -V; const TargetRegisterClass *RC = MRI->getRegClass(DefR.Reg); - unsigned NewR = MRI->createVirtualRegister(RC); + Register NewR = MRI->createVirtualRegister(RC); const MachineOperand &Src1 = MI.getOperand(1); NewMI = BuildMI(B, At, DL, D, NewR) .addReg(Src1.getReg(), getRegState(Src1), Src1.getSubReg()) @@ -3111,8 +3111,8 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI, void HexagonConstEvaluator::replaceAllRegUsesWith(unsigned FromReg, unsigned ToReg) { - assert(TargetRegisterInfo::isVirtualRegister(FromReg)); - assert(TargetRegisterInfo::isVirtualRegister(ToReg)); + assert(Register::isVirtualRegister(FromReg)); + assert(Register::isVirtualRegister(ToReg)); for (auto I = MRI->use_begin(FromReg), E = MRI->use_end(); I != E;) { MachineOperand &O = *I; ++I; diff --git a/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/lib/Target/Hexagon/HexagonCopyToCombine.cpp index a09ccab483cf..394a329ac447 100644 --- a/lib/Target/Hexagon/HexagonCopyToCombine.cpp +++ b/lib/Target/Hexagon/HexagonCopyToCombine.cpp @@ -133,8 +133,8 @@ static bool isCombinableInstType(MachineInstr &MI, const HexagonInstrInfo *TII, const MachineOperand &Op1 = MI.getOperand(1); assert(Op0.isReg() && Op1.isReg()); - unsigned DestReg = Op0.getReg(); - unsigned SrcReg = Op1.getReg(); + Register DestReg = Op0.getReg(); + Register SrcReg = Op1.getReg(); return Hexagon::IntRegsRegClass.contains(DestReg) && Hexagon::IntRegsRegClass.contains(SrcReg); } @@ -146,7 +146,7 @@ static bool isCombinableInstType(MachineInstr &MI, const HexagonInstrInfo *TII, const MachineOperand &Op1 = MI.getOperand(1); assert(Op0.isReg()); - unsigned DestReg = Op0.getReg(); + Register DestReg = Op0.getReg(); // Ensure that TargetFlags are MO_NO_FLAG for a global. This is a // workaround for an ABI bug that prevents GOT relocations on combine // instructions @@ -226,7 +226,7 @@ static bool areCombinableOperations(const TargetRegisterInfo *TRI, } static bool isEvenReg(unsigned Reg) { - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(Register::isPhysicalRegister(Reg)); if (Hexagon::IntRegsRegClass.contains(Reg)) return (Reg - Hexagon::R0) % 2 == 0; if (Hexagon::HvxVRRegClass.contains(Reg)) @@ -265,7 +265,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr &I1, unsigned I1DestReg, unsigned I2DestReg, bool &DoInsertAtI1) { - unsigned I2UseReg = UseReg(I2.getOperand(1)); + Register I2UseReg = UseReg(I2.getOperand(1)); // It is not safe to move I1 and I2 into one combine if I2 has a true // dependence on I1. @@ -332,7 +332,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr &I1, // At O3 we got better results (dhrystone) by being more conservative here. 
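[Aside: the HexagonCopyToCombine hunks here repeatedly check that two 32-bit transfer destinations are adjacent and that the lower one is even (isEvenReg), since only pairs such as R1:0 or R3:2 form a 64-bit double register. A hypothetical standalone model of that pairing test; R0Base stands for Hexagon::R0, the first integer register:]

bool formsDoubleRegPair(unsigned DefA, unsigned DefB, unsigned R0Base) {
  unsigned Lo = DefA < DefB ? DefA : DefB;
  unsigned Hi = DefA < DefB ? DefB : DefA;
  bool Adjacent = (Hi - Lo) == 1;         // consecutive registers
  bool EvenLow = (Lo - R0Base) % 2 == 0;  // pair starts on an even register
  return Adjacent && EvenLow;
}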
if (!ShouldCombineAggressively) End = std::next(MachineBasicBlock::iterator(I2)); - unsigned I1UseReg = UseReg(I1.getOperand(1)); + Register I1UseReg = UseReg(I1.getOperand(1)); // Track killed operands. If we move across an instruction that kills our // operand, we need to update the kill information on the moved I1. It kills // the operand now. @@ -410,7 +410,7 @@ HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) { continue; // Look for the defining instruction. - unsigned Reg = Op.getReg(); + Register Reg = Op.getReg(); MachineInstr *DefInst = LastDef[Reg]; if (!DefInst) continue; @@ -442,7 +442,7 @@ HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) { if (Op.isReg()) { if (!Op.isDef() || !Op.getReg()) continue; - unsigned Reg = Op.getReg(); + Register Reg = Op.getReg(); if (Hexagon::DoubleRegsRegClass.contains(Reg)) { for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) LastDef[*SubRegs] = &MI; @@ -528,7 +528,7 @@ MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr &I1, while (I2 != I1.getParent()->end() && I2->isDebugInstr()) ++I2; - unsigned I1DestReg = I1.getOperand(0).getReg(); + Register I1DestReg = I1.getOperand(0).getReg(); for (MachineBasicBlock::iterator End = I1.getParent()->end(); I2 != End; ++I2) { @@ -544,7 +544,7 @@ MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr &I1, if (ShouldCombineAggressively && PotentiallyNewifiableTFR.count(&*I2)) continue; - unsigned I2DestReg = I2->getOperand(0).getReg(); + Register I2DestReg = I2->getOperand(0).getReg(); // Check that registers are adjacent and that the first destination register // is even. @@ -579,8 +579,8 @@ void HexagonCopyToCombine::combine(MachineInstr &I1, MachineInstr &I2, ++MI; // Figure out whether I1 or I2 goes into the lowreg part. - unsigned I1DestReg = I1.getOperand(0).getReg(); - unsigned I2DestReg = I2.getOperand(0).getReg(); + Register I1DestReg = I1.getOperand(0).getReg(); + Register I2DestReg = I2.getOperand(0).getReg(); bool IsI1Loreg = (I2DestReg - I1DestReg) == 1; unsigned LoRegDef = IsI1Loreg ? 
I1DestReg : I2DestReg; unsigned SubLo; @@ -758,7 +758,7 @@ void HexagonCopyToCombine::emitCombineIR(MachineBasicBlock::iterator &InsertPt, unsigned DoubleDestReg, MachineOperand &HiOperand, MachineOperand &LoOperand) { - unsigned LoReg = LoOperand.getReg(); + Register LoReg = LoOperand.getReg(); unsigned LoRegKillFlag = getKillRegState(LoOperand.isKill()); DebugLoc DL = InsertPt->getDebugLoc(); @@ -807,7 +807,7 @@ void HexagonCopyToCombine::emitCombineRI(MachineBasicBlock::iterator &InsertPt, MachineOperand &HiOperand, MachineOperand &LoOperand) { unsigned HiRegKillFlag = getKillRegState(HiOperand.isKill()); - unsigned HiReg = HiOperand.getReg(); + Register HiReg = HiOperand.getReg(); DebugLoc DL = InsertPt->getDebugLoc(); MachineBasicBlock *BB = InsertPt->getParent(); @@ -857,8 +857,8 @@ void HexagonCopyToCombine::emitCombineRR(MachineBasicBlock::iterator &InsertPt, MachineOperand &LoOperand) { unsigned LoRegKillFlag = getKillRegState(LoOperand.isKill()); unsigned HiRegKillFlag = getKillRegState(HiOperand.isKill()); - unsigned LoReg = LoOperand.getReg(); - unsigned HiReg = HiOperand.getReg(); + Register LoReg = LoOperand.getReg(); + Register HiReg = HiOperand.getReg(); DebugLoc DL = InsertPt->getDebugLoc(); MachineBasicBlock *BB = InsertPt->getParent(); diff --git a/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td index 2ce1419e4790..e4a2ba0ec29c 100644 --- a/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td +++ b/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td @@ -37,12 +37,12 @@ def: Pat<(int_hexagon_F2_sfmax IntRegs:$src1, IntRegs:$src2), (F2_sfmax IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vabswsat DoubleRegs:$src1), (A2_vabswsat DoubleRegs:$src1)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$src1, u5_0ImmPred:$src2), - (S2_asr_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2), - (S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_combineri IntRegs:$src1, s32_0ImmPred:$src2), - (A4_combineri IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2), + (S2_asr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2), + (S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_combineri IntRegs:$src1, s32_0ImmPred_timm:$src2), + (A4_combineri IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_nac_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpy_nac_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M4_vpmpyh_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), @@ -75,8 +75,8 @@ def: Pat<(int_hexagon_A2_vaddws DoubleRegs:$src1, DoubleRegs:$src2), (A2_vaddws DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_maxup DoubleRegs:$src1, DoubleRegs:$src2), (A2_maxup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred:$src2), - (A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2), + (A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_interleave 
DoubleRegs:$src1), (S2_interleave DoubleRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_vrcmpyi_s0 DoubleRegs:$src1, DoubleRegs:$src2), @@ -89,10 +89,10 @@ def: Pat<(int_hexagon_C2_cmpgtu IntRegs:$src1, IntRegs:$src2), (C2_cmpgtu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2), (C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_cmphgtui IntRegs:$src1, u32_0ImmPred:$src2), - (A4_cmphgtui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_C2_cmpgti IntRegs:$src1, s32_0ImmPred:$src2), - (C2_cmpgti IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmphgtui IntRegs:$src1, u32_0ImmPred_timm:$src2), + (A4_cmphgtui IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_cmpgti IntRegs:$src1, s32_0ImmPred_timm:$src2), + (C2_cmpgti IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyi IntRegs:$src1, IntRegs:$src2), (M2_mpyi IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_conv_df2uw_chop DoubleRegs:$src1), @@ -103,12 +103,12 @@ def: Pat<(int_hexagon_M2_mpy_lh_s1 IntRegs:$src1, IntRegs:$src2), (M2_mpy_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_lh_s0 IntRegs:$src1, IntRegs:$src2), (M2_mpy_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_vrcnegh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), (S2_vrcnegh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_extractup DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3), - (S2_extractup DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_extractup DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3), + (S2_extractup DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S4_ntstbit_r IntRegs:$src1, IntRegs:$src2), (S4_ntstbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_conv_w2sf IntRegs:$src1), @@ -125,10 +125,10 @@ def: Pat<(int_hexagon_A4_cmpbgt IntRegs:$src1, IntRegs:$src2), (A4_cmpbgt IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_asr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (S2_asr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_rcmpneqi IntRegs:$src1, s32_0ImmPred:$src2), - (A4_rcmpneqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_rcmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2), + (A4_rcmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: 
Pat<(int_hexagon_M2_subacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_subacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_orp DoubleRegs:$src1, DoubleRegs:$src2), @@ -137,28 +137,28 @@ def: Pat<(int_hexagon_M2_mpyu_up IntRegs:$src1, IntRegs:$src2), (M2_mpyu_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_acc_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpy_acc_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2), - (S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2), - (S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2), + (S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2), + (S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A4_cmpbgtu IntRegs:$src1, IntRegs:$src2), (A4_cmpbgtu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A4_vcmpbeq_any DoubleRegs:$src1, DoubleRegs:$src2), (A4_vcmpbeq_any DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_cmpbgti IntRegs:$src1, s8_0ImmPred:$src2), - (A4_cmpbgti IntRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmpbgti IntRegs:$src1, s8_0ImmPred_timm:$src2), + (A4_cmpbgti IntRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyd_lh_s1 IntRegs:$src1, IntRegs:$src2), (M2_mpyd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), (S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_addsp IntRegs:$src1, DoubleRegs:$src2), (A2_addsp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S4_vxsubaddw DoubleRegs:$src1, DoubleRegs:$src2), (S4_vxsubaddw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred:$src2), - (A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2), + (A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S4_vxsubaddh DoubleRegs:$src1, DoubleRegs:$src2), (S4_vxsubaddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M4_pmpyw IntRegs:$src1, IntRegs:$src2), @@ -177,10 +177,10 @@ def: Pat<(int_hexagon_A2_pxorf PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), (A2_pxorf PredRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2), (A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred:$src2), - 
(S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$src1, u5_0ImmPred:$src2), - (S2_asl_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2), + (S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2), + (S2_asl_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A4_vrminuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), (A4_vrminuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_sffma IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), @@ -199,10 +199,10 @@ def: Pat<(int_hexagon_M4_vrmpyoh_s1 DoubleRegs:$src1, DoubleRegs:$src2), (M4_vrmpyoh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_C2_bitsset IntRegs:$src1, IntRegs:$src2), (C2_bitsset IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_M2_mpysip IntRegs:$src1, u32_0ImmPred:$src2), - (M2_mpysip IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_M2_mpysin IntRegs:$src1, u8_0ImmPred:$src2), - (M2_mpysin IntRegs:$src1, u8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpysip IntRegs:$src1, u32_0ImmPred_timm:$src2), + (M2_mpysip IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpysin IntRegs:$src1, u8_0ImmPred_timm:$src2), + (M2_mpysin IntRegs:$src1, u8_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A4_boundscheck IntRegs:$src1, DoubleRegs:$src2), (A4_boundscheck IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M5_vrmpybuu DoubleRegs:$src1, DoubleRegs:$src2), @@ -225,10 +225,10 @@ def: Pat<(int_hexagon_F2_conv_ud2df DoubleRegs:$src1), (F2_conv_ud2df DoubleRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vnavgw DoubleRegs:$src1, DoubleRegs:$src2), (A2_vnavgw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_subi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S4_subi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_subi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S4_subi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_vzxthw IntRegs:$src1), (S2_vzxthw IntRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_sfadd IntRegs:$src1, IntRegs:$src2), @@ -241,12 +241,12 @@ def: Pat<(int_hexagon_M2_vmac2su_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$sr (M2_vmac2su_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2), (M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3, u5_0ImmPred:$src4), - (S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3, u5_0ImmPred:$src4)>, Requires<[HasV5]>; +def: 
Pat<(int_hexagon_S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3, u5_0ImmPred_timm:$src4), + (S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3, u5_0ImmPred_timm:$src4)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_packhl IntRegs:$src1, IntRegs:$src2), (S2_packhl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred:$src2), - (A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2), + (A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vavguwr DoubleRegs:$src1, DoubleRegs:$src2), (A2_vavguwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_asl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), @@ -259,8 +259,8 @@ def: Pat<(int_hexagon_M4_and_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M4_and_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_conv_d2df DoubleRegs:$src1), (F2_conv_d2df DoubleRegs:$src1)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_C2_cmpgtui IntRegs:$src1, u32_0ImmPred:$src2), - (C2_cmpgtui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_cmpgtui IntRegs:$src1, u32_0ImmPred_timm:$src2), + (C2_cmpgtui IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vconj DoubleRegs:$src1), (A2_vconj DoubleRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_lsr_r_vw DoubleRegs:$src1, IntRegs:$src2), @@ -279,8 +279,8 @@ def: Pat<(int_hexagon_C2_any8 PredRegs:$src1), (C2_any8 PredRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_togglebit_r IntRegs:$src1, IntRegs:$src2), (S2_togglebit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_togglebit_i IntRegs:$src1, u5_0ImmPred:$src2), - (S2_togglebit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_togglebit_i IntRegs:$src1, u5_0ImmPred_timm:$src2), + (S2_togglebit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_conv_uw2sf IntRegs:$src1), (F2_conv_uw2sf IntRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_vsathb_nopack DoubleRegs:$src1), @@ -303,10 +303,10 @@ def: Pat<(int_hexagon_C4_or_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3) (C4_or_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_asl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (S2_asl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred:$src2), - (A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2), + (A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M4_vrmpyoh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), (M4_vrmpyoh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; def: 
Pat<(int_hexagon_M4_vrmpyoh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
@@ -323,34 +323,34 @@ def: Pat<(int_hexagon_M2_vrcmacr_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleR
          (M2_vrcmacr_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_vavgwcr DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_vavgwcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
-         (S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+         (S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A4_vrmaxw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
          (A4_vrmaxw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_vnavghr DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_vnavghr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M4_cmpyi_wh DoubleRegs:$src1, IntRegs:$src2),
          (M4_cmpyi_wh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_tfrsi s32_0ImmPred:$src1),
-         (A2_tfrsi s32_0ImmPred:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfrsi s32_0ImmPred_timm:$src1),
+         (A2_tfrsi s32_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+         (S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_svnavgh IntRegs:$src1, IntRegs:$src2),
          (A2_svnavgh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$src1, u5_0ImmPred:$src2),
-         (S2_lsr_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
+         (S2_lsr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_vmac2 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_vmac2 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred:$src2),
-         (A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2),
+         (A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_svavgh IntRegs:$src1, IntRegs:$src2),
          (A2_svavgh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M4_vrmpyeh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
          (M4_vrmpyeh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M4_vrmpyeh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
          (M4_vrmpyeh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2),
-         (S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
+         (S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_combine_hl IntRegs:$src1, IntRegs:$src2),
          (A2_combine_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_up IntRegs:$src1, IntRegs:$src2),
@@ -381,10 +381,10 @@ def: Pat<(int_hexagon_M2_cmacr_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3
          (M2_cmacr_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M4_or_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M4_or_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_mpyrr_addi u32_0ImmPred:$src1, IntRegs:$src2, IntRegs:$src3),
-         (M4_mpyrr_addi u32_0ImmPred:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
-         (S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_mpyrr_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M4_mpyrr_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+         (S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_sat_hl_s0 IntRegs:$src1, IntRegs:$src2),
          (M2_mpy_sat_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_sat_hl_s1 IntRegs:$src1, IntRegs:$src2),
@@ -453,8 +453,8 @@ def: Pat<(int_hexagon_M2_mpy_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2),
          (M2_mpy_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_F2_sffms_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (F2_sffms_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_cmpneqi IntRegs:$src1, s32_0ImmPred:$src2),
-         (C4_cmpneqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+         (C4_cmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M4_and_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M4_and_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_sat DoubleRegs:$src1),
@@ -469,8 +469,8 @@ def: Pat<(int_hexagon_A2_svavghs IntRegs:$src1, IntRegs:$src2),
          (A2_svavghs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_vrsadub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
          (A2_vrsadub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_bitsclri IntRegs:$src1, u6_0ImmPred:$src2),
-         (C2_bitsclri IntRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_bitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2),
+         (C2_bitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_subh_h16_sat_hh IntRegs:$src1, IntRegs:$src2),
          (A2_subh_h16_sat_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_subh_h16_sat_hl IntRegs:$src1, IntRegs:$src2),
@@ -535,10 +535,10 @@ def: Pat<(int_hexagon_C2_vmux PredRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3
          (C2_vmux PredRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_parityp DoubleRegs:$src1, DoubleRegs:$src2),
          (S2_parityp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
-         (S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+         (S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+         (S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyu_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_mpyu_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyu_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -557,30 +557,30 @@ def: Pat<(int_hexagon_M2_cnacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src
          (M2_cnacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_cnacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_cnacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_subaddi IntRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3),
-         (S4_subaddi IntRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_subaddi IntRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3),
+         (S4_subaddi IntRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyud_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_mpyud_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyud_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_mpyud_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_tstbit_r IntRegs:$src1, IntRegs:$src2),
          (S2_tstbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred:$src3),
-         (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred_timm:$src3),
+         (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mmachs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
          (M2_mmachs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mmachs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
          (M2_mmachs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_tstbit_i IntRegs:$src1, u5_0ImmPred:$src2),
-         (S2_tstbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_tstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
+         (S2_tstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_up_s1 IntRegs:$src1, IntRegs:$src2),
          (M2_mpy_up_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_extractu_rp IntRegs:$src1, DoubleRegs:$src2),
          (S2_extractu_rp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mmpyuh_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
          (M2_mmpyuh_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2),
-         (S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
+         (S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
          (M2_mpy_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
@@ -605,14 +605,14 @@ def: Pat<(int_hexagon_F2_conv_w2df IntRegs:$src1),
          (F2_conv_w2df IntRegs:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_subh_l16_sat_hl IntRegs:$src1, IntRegs:$src2),
          (A2_subh_l16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_cmpeqi IntRegs:$src1, s32_0ImmPred:$src2),
-         (C2_cmpeqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+         (C2_cmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+         (S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_vcnegh DoubleRegs:$src1, IntRegs:$src2),
          (S2_vcnegh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred:$src2),
-         (A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
+         (A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2),
          (M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_vdmpyrs_s1 DoubleRegs:$src1, DoubleRegs:$src2),
@@ -633,8 +633,8 @@ def: Pat<(int_hexagon_S2_asl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3
          (S2_asl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_cl0p DoubleRegs:$src1),
          (S2_cl0p DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3),
-         (S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3),
+         (S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_F2_sffixupd IntRegs:$src1, IntRegs:$src2),
          (F2_sffixupd IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_sat_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2),
@@ -653,8 +653,8 @@ def: Pat<(int_hexagon_M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs
          (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mmpyul_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
          (M2_mmpyul_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_ntstbit_i IntRegs:$src1, u5_0ImmPred:$src2),
-         (S4_ntstbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_ntstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
+         (S4_ntstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_F2_sffixupr IntRegs:$src1),
          (F2_sffixupr IntRegs:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_asr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
@@ -669,32 +669,32 @@ def: Pat<(int_hexagon_C2_andn PredRegs:$src1, PredRegs:$src2),
          (C2_andn PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_vmpy2s_s0pack IntRegs:$src1, IntRegs:$src2),
          (M2_vmpy2s_s0pack IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
-         (S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+         (S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyd_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_mpyd_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_acc_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_mpy_acc_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_rcmpeqi IntRegs:$src1, s32_0ImmPred:$src2),
-         (A4_rcmpeqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_rcmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+         (A4_rcmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M4_xor_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M4_xor_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
-         (S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+         (S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mmpyuh_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
          (M2_mmpyuh_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_asr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (S2_asr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_round_ri IntRegs:$src1, u5_0ImmPred:$src2),
-         (A4_round_ri IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_round_ri IntRegs:$src1, u5_0ImmPred_timm:$src2),
+         (A4_round_ri IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_max IntRegs:$src1, IntRegs:$src2),
          (A2_max IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A4_round_rr IntRegs:$src1, IntRegs:$src2),
          (A4_round_rr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_combineii s8_0ImmPred:$src1, u32_0ImmPred:$src2),
-         (A4_combineii s8_0ImmPred:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_combineir s32_0ImmPred:$src1, IntRegs:$src2),
-         (A4_combineir s32_0ImmPred:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_combineii s8_0ImmPred_timm:$src1, u32_0ImmPred_timm:$src2),
+         (A4_combineii s8_0ImmPred_timm:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_combineir s32_0ImmPred_timm:$src1, IntRegs:$src2),
+         (A4_combineir s32_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_C4_and_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
          (C4_and_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M5_vmacbuu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -703,8 +703,8 @@ def: Pat<(int_hexagon_A4_rcmpeq IntRegs:$src1, IntRegs:$src2),
          (A4_rcmpeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M4_cmpyr_whc DoubleRegs:$src1, IntRegs:$src2),
          (M4_cmpyr_whc DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+         (S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_vzxtbh IntRegs:$src1),
          (S2_vzxtbh IntRegs:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mmacuhs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
@@ -721,8 +721,8 @@ def: Pat<(int_hexagon_M2_cmpyi_s0 IntRegs:$src1, IntRegs:$src2),
          (M2_cmpyi_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
          (S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_ori_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S4_ori_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_ori_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+         (S4_ori_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_C4_nbitsset IntRegs:$src1, IntRegs:$src2),
          (C4_nbitsset IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyu_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -745,10 +745,10 @@ def: Pat<(int_hexagon_M2_mpyd_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs
          (M2_mpyd_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyd_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_mpyd_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfimm_p u10_0ImmPred:$src1),
-         (F2_sfimm_p u10_0ImmPred:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfimm_n u10_0ImmPred:$src1),
-         (F2_sfimm_n u10_0ImmPred:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfimm_p u10_0ImmPred_timm:$src1),
+         (F2_sfimm_p u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfimm_n u10_0ImmPred_timm:$src1),
+         (F2_sfimm_n u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M4_cmpyr_wh DoubleRegs:$src1, IntRegs:$src2),
          (M4_cmpyr_wh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_lsl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
@@ -759,14 +759,14 @@ def: Pat<(int_hexagon_F2_conv_d2sf DoubleRegs:$src1),
          (F2_conv_d2sf DoubleRegs:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_vavguh DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_vavguh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpbeqi IntRegs:$src1, u8_0ImmPred:$src2),
-         (A4_cmpbeqi IntRegs:$src1, u8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbeqi IntRegs:$src1, u8_0ImmPred_timm:$src2),
+         (A4_cmpbeqi IntRegs:$src1, u8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_F2_sfcmpuo IntRegs:$src1, IntRegs:$src2),
          (F2_sfcmpuo IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_vavguw DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_vavguw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
-         (S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+         (S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_vsatwh_nopack DoubleRegs:$src1),
          (S2_vsatwh_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyd_hh_s0 IntRegs:$src1, IntRegs:$src2),
@@ -783,8 +783,8 @@ def: Pat<(int_hexagon_M4_or_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M4_or_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_minp DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_minp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
-         (S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+         (S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2),
          (M2_mpy_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2),
@@ -817,16 +817,16 @@ def: Pat<(int_hexagon_S4_extract_rp IntRegs:$src1, DoubleRegs:$src2),
          (S4_extract_rp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_lsl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (S2_lsl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_cmplteui IntRegs:$src1, u32_0ImmPred:$src2),
-         (C4_cmplteui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_addi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S4_addi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmplteui IntRegs:$src1, u32_0ImmPred_timm:$src2),
+         (C4_cmplteui IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_addi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+         (S4_addi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A4_tfrcpp CtrRegs64:$src1),
          (A4_tfrcpp CtrRegs64:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred:$src2),
-         (S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmphgti IntRegs:$src1, s32_0ImmPred:$src2),
-         (A4_cmphgti IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
+         (S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmphgti IntRegs:$src1, s32_0ImmPred_timm:$src2),
+         (A4_cmphgti IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A4_vrminh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
          (A4_vrminh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A4_vrminw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
@@ -837,8 +837,8 @@ def: Pat<(int_hexagon_S2_insertp_rp DoubleRegs:$src1, DoubleRegs:$src2, DoubleRe
          (S2_insertp_rp DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_vnavghcr DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_vnavghcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_subi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S4_subi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_subi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+         (S4_subi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_lsl_r_vh DoubleRegs:$src1, IntRegs:$src2),
          (S2_lsl_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_hh_s0 IntRegs:$src1, IntRegs:$src2),
@@ -851,14 +851,14 @@ def: Pat<(int_hexagon_S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs
          (S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_satb IntRegs:$src1),
          (A2_satb IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3, u6_0ImmPred:$src4),
-         (S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3, u6_0ImmPred:$src4)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3, u6_0ImmPred_timm:$src4),
+         (S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3, u6_0ImmPred_timm:$src4)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyd_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
          (M2_mpyd_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyd_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
          (M2_mpyd_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
-         (S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+         (S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_extractup_rp DoubleRegs:$src1, DoubleRegs:$src2),
          (S2_extractup_rp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S4_vxaddsubw DoubleRegs:$src1, DoubleRegs:$src2),
@@ -925,8 +925,8 @@ def: Pat<(int_hexagon_M2_cmpyr_s0 IntRegs:$src1, IntRegs:$src2),
          (M2_cmpyr_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_dpmpyss_rnd_s0 IntRegs:$src1, IntRegs:$src2),
          (M2_dpmpyss_rnd_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_muxri PredRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3),
-         (C2_muxri PredRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_muxri PredRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3),
+         (C2_muxri PredRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_vmac2es_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
          (M2_vmac2es_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_vmac2es_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
@@ -937,8 +937,8 @@ def: Pat<(int_hexagon_M2_mpyu_lh_s1 IntRegs:$src1, IntRegs:$src2),
          (M2_mpyu_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyu_lh_s0 IntRegs:$src1, IntRegs:$src2),
          (M2_mpyu_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+         (S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyd_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_mpyd_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyd_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -947,8 +947,8 @@ def: Pat<(int_hexagon_S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs
          (S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_vaddw DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_vaddw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+         (S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_vaddh DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_vaddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_nac_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -957,16 +957,16 @@ def: Pat<(int_hexagon_M2_mpy_nac_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs
          (M2_mpy_nac_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2),
          (C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_mpyri_addi u32_0ImmPred:$src1, IntRegs:$src2, u6_0ImmPred:$src3),
-         (M4_mpyri_addi u32_0ImmPred:$src1, IntRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_andi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S4_andi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3),
-         (M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_mpyri_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, u6_0ImmPred_timm:$src3),
+         (M4_mpyri_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_andi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+         (S4_andi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3),
+         (M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_tfrcrr CtrRegs:$src1),
          (A2_tfrcrr CtrRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3),
-         (M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3),
+         (M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_C2_orn PredRegs:$src1, PredRegs:$src2),
          (C2_orn PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M4_and_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -1005,8 +1005,8 @@ def: Pat<(int_hexagon_M2_vrcmpys_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, IntR
          (M2_vrcmpys_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_F2_dfcmpge DoubleRegs:$src1, DoubleRegs:$src2),
          (F2_dfcmpge DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
-         (M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+         (M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A5_vaddhubs DoubleRegs:$src1, DoubleRegs:$src2),
          (A5_vaddhubs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_vmaxw DoubleRegs:$src1, DoubleRegs:$src2),
@@ -1017,10 +1017,10 @@ def: Pat<(int_hexagon_A2_vmaxh DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_vmaxh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_vsxthw IntRegs:$src1),
          (S2_vsxthw IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_andi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S4_andi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
-         (S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_andi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+         (S4_andi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+         (S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
          (S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_C2_cmpgt IntRegs:$src1, IntRegs:$src2),
@@ -1035,22 +1035,22 @@ def: Pat<(int_hexagon_F2_conv_sf2w IntRegs:$src1),
          (F2_conv_sf2w IntRegs:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
          (S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfclass IntRegs:$src1, u5_0ImmPred:$src2),
-         (F2_sfclass IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfclass IntRegs:$src1, u5_0ImmPred_timm:$src2),
+         (F2_sfclass IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyud_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_mpyud_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M4_xor_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M4_xor_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred:$src3),
-         (S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred_timm:$src3),
+         (S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M5_vdmpybsu DoubleRegs:$src1, DoubleRegs:$src2),
          (M5_vdmpybsu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyu_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_mpyu_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyu_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_mpyu_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addi IntRegs:$src1, s32_0ImmPred:$src2),
-         (A2_addi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+         (A2_addi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_addp DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_addp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_vmpy2s_s1pack IntRegs:$src1, IntRegs:$src2),
@@ -1063,8 +1063,8 @@ def: Pat<(int_hexagon_M2_nacci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_nacci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_shuffeh DoubleRegs:$src1, DoubleRegs:$src2),
          (S2_shuffeh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+         (S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_sat_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2),
          (M2_mpy_sat_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_sat_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2),
@@ -1131,12 +1131,12 @@ def: Pat<(int_hexagon_C2_and PredRegs:$src1, PredRegs:$src2),
          (C2_and PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S5_popcountp DoubleRegs:$src1),
          (S5_popcountp DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_extractp DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3),
-         (S4_extractp DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_extractp DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3),
+         (S4_extractp DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_cl0 IntRegs:$src1),
          (S2_cl0 IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred:$src2),
-         (A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
+         (A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mmacls_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
          (M2_mmacls_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mmacls_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
@@ -1167,8 +1167,8 @@ def: Pat<(int_hexagon_M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_vmaxuh DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_vmaxuh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_bitspliti IntRegs:$src1, u5_0ImmPred:$src2),
-         (A4_bitspliti IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_bitspliti IntRegs:$src1, u5_0ImmPred_timm:$src2),
+         (A4_bitspliti IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_vmaxub DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_vmaxub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyud_hh_s0 IntRegs:$src1, IntRegs:$src2),
@@ -1185,26 +1185,26 @@ def: Pat<(int_hexagon_F2_conv_sf2d IntRegs:$src1),
          (F2_conv_sf2d IntRegs:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_asr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (S2_asr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfimm_n u10_0ImmPred:$src1),
-         (F2_dfimm_n u10_0ImmPred:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfimm_n u10_0ImmPred_timm:$src1),
+         (F2_dfimm_n u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A4_cmphgt IntRegs:$src1, IntRegs:$src2),
          (A4_cmphgt IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfimm_p u10_0ImmPred:$src1),
-         (F2_dfimm_p u10_0ImmPred:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfimm_p u10_0ImmPred_timm:$src1),
+         (F2_dfimm_p u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyud_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_mpyud_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_vcmpy_s1_sat_r DoubleRegs:$src1, DoubleRegs:$src2),
          (M2_vcmpy_s1_sat_r DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred:$src2, IntRegs:$src3),
-         (M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred_timm:$src2, IntRegs:$src3),
+         (M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_vcmpy_s1_sat_i DoubleRegs:$src1, DoubleRegs:$src2),
          (M2_vcmpy_s1_sat_i DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_lsl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
          (S2_lsl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M5_vrmacbuu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
          (M5_vrmacbuu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3),
-         (S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3),
+         (S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_cnacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -1215,20 +1215,20 @@ def: Pat<(int_hexagon_A2_maxu IntRegs:$src1, IntRegs:$src2),
          (A2_maxu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_maxp DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_maxp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_andir IntRegs:$src1, s32_0ImmPred:$src2),
-         (A2_andir IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_andir IntRegs:$src1, s32_0ImmPred_timm:$src2),
+         (A2_andir IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_F2_sfrecipa IntRegs:$src1, IntRegs:$src2),
          (F2_sfrecipa IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_combineii s32_0ImmPred:$src1, s8_0ImmPred:$src2),
-         (A2_combineii s32_0ImmPred:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combineii s32_0ImmPred_timm:$src1, s8_0ImmPred_timm:$src2),
+         (A2_combineii s32_0ImmPred_timm:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A4_orn IntRegs:$src1, IntRegs:$src2),
          (A4_orn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpbgtui IntRegs:$src1, u32_0ImmPred:$src2),
-         (A4_cmpbgtui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbgtui IntRegs:$src1, u32_0ImmPred_timm:$src2),
+         (A4_cmpbgtui IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_lsr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (S2_lsr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred:$src2),
-         (A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred_timm:$src2),
+         (A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_lsl_r_r IntRegs:$src1, IntRegs:$src2),
          (S2_lsl_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_lsl_r_p DoubleRegs:$src1, IntRegs:$src2),
@@ -1251,16 +1251,16 @@ def: Pat<(int_hexagon_A2_satub IntRegs:$src1),
          (A2_satub IntRegs:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_vrcmpys_s1 DoubleRegs:$src1, IntRegs:$src2),
          (M2_vrcmpys_s1 DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
-         (S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+         (S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_C4_fastcorner9_not PredRegs:$src1, PredRegs:$src2),
          (C4_fastcorner9_not PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_tfrih IntRegs:$src1, u16_0ImmPred:$src2),
-         (A2_tfrih IntRegs:$src1, u16_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_tfril IntRegs:$src1, u16_0ImmPred:$src2),
-         (A2_tfril IntRegs:$src1, u16_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3),
-         (M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfrih IntRegs:$src1, u16_0ImmPred_timm:$src2),
+         (A2_tfrih IntRegs:$src1, u16_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfril IntRegs:$src1, u16_0ImmPred_timm:$src2),
+         (A2_tfril IntRegs:$src1, u16_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3),
+         (M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_vtrunehb DoubleRegs:$src1),
          (S2_vtrunehb DoubleRegs:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_vabsw DoubleRegs:$src1),
@@ -1269,14 +1269,14 @@ def: Pat<(int_hexagon_A2_vabsh DoubleRegs:$src1),
          (A2_vabsh DoubleRegs:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_F2_sfsub IntRegs:$src1, IntRegs:$src2),
          (F2_sfsub IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_muxii PredRegs:$src1, s32_0ImmPred:$src2, s8_0ImmPred:$src3),
-         (C2_muxii PredRegs:$src1, s32_0ImmPred:$src2, s8_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_muxir PredRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
-         (C2_muxir PredRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_muxii PredRegs:$src1, s32_0ImmPred_timm:$src2, s8_0ImmPred_timm:$src3),
+         (C2_muxii PredRegs:$src1, s32_0ImmPred_timm:$src2, s8_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_muxir PredRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+         (C2_muxir PredRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_swiz IntRegs:$src1),
          (A2_swiz IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
-         (S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+         (S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_cmpyrsc_s0 IntRegs:$src1, IntRegs:$src2),
          (M2_cmpyrsc_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_cmpyrsc_s1 IntRegs:$src1, IntRegs:$src2),
@@ -1295,44 +1295,44 @@ def: Pat<(int_hexagon_M2_mpy_nac_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs
          (M2_mpy_nac_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_nac_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_mpy_nac_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_extract IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3),
-         (S4_extract IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_extract IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3),
+         (S4_extract IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_vcmpweq DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_vcmpweq DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_acci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_acci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
-         (S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
-         (S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+         (S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+         (S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_F2_conv_ud2sf DoubleRegs:$src1),
          (F2_conv_ud2sf DoubleRegs:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_tfr IntRegs:$src1),
          (A2_tfr IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
-         (S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subri s32_0ImmPred:$src1, IntRegs:$src2),
-         (A2_subri s32_0ImmPred:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+         (S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subri s32_0ImmPred_timm:$src1, IntRegs:$src2),
+         (A2_subri s32_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A4_vrmaxuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
          (A4_vrmaxuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M5_vmpybuu IntRegs:$src1, IntRegs:$src2),
          (M5_vmpybuu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A4_vrmaxuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
          (A4_vrmaxuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2),
-         (S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
+         (S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_vavgw DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_vavgw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_brev IntRegs:$src1),
          (S2_brev IntRegs:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_vavgh DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_vavgh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_clrbit_i IntRegs:$src1, u5_0ImmPred:$src2),
-         (S2_clrbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2),
-         (S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_clrbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
+         (S2_clrbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
+         (S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+         (S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mmpyl_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
@@ -1343,8 +1343,8 @@ def: Pat<(int_hexagon_M2_mmpyl_s0 DoubleRegs:$src1, DoubleRegs:$src2),
          (M2_mmpyl_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mmpyl_s1 DoubleRegs:$src1, DoubleRegs:$src2),
          (M2_mmpyl_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
-         (M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+         (M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_vrndpackwhs DoubleRegs:$src1),
          (S2_vrndpackwhs DoubleRegs:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_vtrunewh DoubleRegs:$src1, DoubleRegs:$src2),
@@ -1357,24 +1357,24 @@ def: Pat<(int_hexagon_M2_mpyd_ll_s1 IntRegs:$src1, IntRegs:$src2),
          (M2_mpyd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M4_mac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M4_mac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred:$src4),
-         (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred:$src4)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred_timm:$src4),
+         (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred_timm:$src4)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_F2_conv_uw2df IntRegs:$src1),
          (F2_conv_uw2df IntRegs:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_vaddubs DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_vaddubs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_asr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (S2_asr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_orir IntRegs:$src1, s32_0ImmPred:$src2),
-         (A2_orir IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_orir IntRegs:$src1, s32_0ImmPred_timm:$src2),
+         (A2_orir IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_andp DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_andp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_lfsp DoubleRegs:$src1, DoubleRegs:$src2),
          (S2_lfsp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_min IntRegs:$src1, IntRegs:$src2),
          (A2_min IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpysmi IntRegs:$src1, m32_0ImmPred:$src2),
-         (M2_mpysmi IntRegs:$src1, m32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpysmi IntRegs:$src1, m32_0ImmPred_timm:$src2),
+         (M2_mpysmi IntRegs:$src1, m32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_vcmpy_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2),
          (M2_vcmpy_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyu_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -1397,10 +1397,10 @@ def: Pat<(int_hexagon_M2_mpyd_lh_s0 IntRegs:$src1, IntRegs:$src2),
          (M2_mpyd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_F2_conv_df2w DoubleRegs:$src1),
          (F2_conv_df2w DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred:$src2),
-         (S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
+         (S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+         (S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_F2_conv_df2d DoubleRegs:$src1),
          (F2_conv_df2d DoubleRegs:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mmaculs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
@@ -1423,8 +1423,8 @@ def: Pat<(int_hexagon_A2_vavghr DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_vavghr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_F2_sffma_sc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, PredRegs:$src4),
          (F2_sffma_sc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, PredRegs:$src4)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfclass DoubleRegs:$src1, u5_0ImmPred:$src2),
-         (F2_dfclass DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfclass DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
+         (F2_dfclass DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_F2_conv_df2ud DoubleRegs:$src1),
          (F2_conv_df2ud DoubleRegs:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_F2_conv_df2uw DoubleRegs:$src1),
@@ -1433,16 +1433,16 @@ def: Pat<(int_hexagon_M2_cmpyrs_s0 IntRegs:$src1, IntRegs:$src2),
          (M2_cmpyrs_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_cmpyrs_s1 IntRegs:$src1, IntRegs:$src2),
          (M2_cmpyrs_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_cmpltei IntRegs:$src1, s32_0ImmPred:$src2),
-         (C4_cmpltei IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmpltei IntRegs:$src1, s32_0ImmPred_timm:$src2),
+         (C4_cmpltei IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_C4_cmplteu IntRegs:$src1, IntRegs:$src2),
          (C4_cmplteu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_vsubb_map DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_subh_l16_ll IntRegs:$src1, IntRegs:$src2),
          (A2_subh_l16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred:$src2),
-         (S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred_timm:$src2),
+         (S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_vrmpy_s0 DoubleRegs:$src1, DoubleRegs:$src2),
          (M2_vrmpy_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyd_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2),
@@ -1471,14 +1471,14 @@ def: Pat<(int_hexagon_M2_mpyud_hl_s0 IntRegs:$src1, IntRegs:$src2),
          (M2_mpyud_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_vrcmpyi_s0c DoubleRegs:$src1, DoubleRegs:$src2),
          (M2_vrcmpyi_s0c DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred:$src2),
-         (S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
+         (S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_addpsat DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_addpsat DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_svaddhs IntRegs:$src1, IntRegs:$src2),
          (A2_svaddhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_ori_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S4_ori_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_ori_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+         (S4_ori_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_sat_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
          (M2_mpy_sat_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_sat_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
@@ -1499,8 +1499,8 @@ def: Pat<(int_hexagon_M2_mpyud_lh_s1 IntRegs:$src1, IntRegs:$src2),
          (M2_mpyud_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_asl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (S2_asl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_lsli s6_0ImmPred:$src1, IntRegs:$src2),
-         (S4_lsli s6_0ImmPred:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_lsli s6_0ImmPred_timm:$src1, IntRegs:$src2),
+         (S4_lsli s6_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_lsl_r_vw DoubleRegs:$src1, IntRegs:$src2),
          (S2_lsl_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_hh_s1 IntRegs:$src1, IntRegs:$src2),
@@ -1529,8 +1529,8 @@ def: Pat<(int_hexagon_A4_cmpbeq IntRegs:$src1, IntRegs:$src2),
          (A4_cmpbeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_negp DoubleRegs:$src1),
          (A2_negp DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred:$src2),
-         (S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred_timm:$src2),
+         (S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_addh_l16_sat_hl IntRegs:$src1, IntRegs:$src2),
          (A2_addh_l16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_vsatwuh DoubleRegs:$src1),
@@ -1541,10 +1541,10 @@ def: Pat<(int_hexagon_S2_svsathb IntRegs:$src1),
          (S2_svsathb IntRegs:$src1)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2),
          (C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cround_ri IntRegs:$src1, u5_0ImmPred:$src2),
-         (A4_cround_ri IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred:$src2),
-         (S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cround_ri IntRegs:$src1, u5_0ImmPred_timm:$src2),
+         (A4_cround_ri IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred_timm:$src2),
+         (S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A4_cround_rr IntRegs:$src1, IntRegs:$src2),
          (A4_cround_rr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_C2_mux PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -1563,12 +1563,12 @@ def: Pat<(int_hexagon_A2_vminuh DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_vminuh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_vminub DoubleRegs:$src1, DoubleRegs:$src2),
          (A2_vminub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_extractu IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3),
-         (S2_extractu IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_extractu IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3),
+         (S2_extractu IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A2_svsubh IntRegs:$src1, IntRegs:$src2),
          (A2_svsubh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_clbaddi IntRegs:$src1, s6_0ImmPred:$src2),
-         (S4_clbaddi IntRegs:$src1, s6_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_clbaddi IntRegs:$src1, s6_0ImmPred_timm:$src2),
+         (S4_clbaddi IntRegs:$src1, s6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_F2_sffms IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (F2_sffms IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_vsxtbh IntRegs:$src1),
@@ -1589,16 +1589,16 @@ def: Pat<(int_hexagon_M2_mpy_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$sr
          (M2_mpy_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_mpy_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_addi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S4_addi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_addi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+         (S4_addi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyd_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_mpyd_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyd_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_mpyd_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpheqi IntRegs:$src1, s32_0ImmPred:$src2),
-         (A4_cmpheqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+         (S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpheqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+         (A4_cmpheqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
          (S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -1623,8 +1623,8 @@ def: Pat<(int_hexagon_M2_mpyud_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntReg
          (M2_mpyud_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpyud_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_mpyud_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_round_ri_sat IntRegs:$src1, u5_0ImmPred:$src2),
-         (A4_round_ri_sat IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_round_ri_sat IntRegs:$src1, u5_0ImmPred_timm:$src2),
+         (A4_round_ri_sat IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_mpy_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_mpy_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -1637,10 +1637,10 @@ def: Pat<(int_hexagon_M2_mmacls_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRe
          (M2_mmacls_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_cmaci_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_cmaci_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_setbit_i IntRegs:$src1, u5_0ImmPred:$src2),
-         (S2_setbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
-         (S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_setbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
+         (S2_setbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+         (S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A4_andn IntRegs:$src1, IntRegs:$src2),
          (A4_andn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M5_vrmpybsu DoubleRegs:$src1, DoubleRegs:$src2),
@@ -1655,8 +1655,8 @@ def: Pat<(int_hexagon_C2_bitsclr IntRegs:$src1, IntRegs:$src2),
          (C2_bitsclr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_xor_xacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
          (M2_xor_xacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred:$src2),
-         (A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2),
+         (A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_A4_ornp DoubleRegs:$src1, DoubleRegs:$src2),
          (A4_ornp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_C4_and_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
@@ -1673,14 +1673,14 @@ def: Pat<(int_hexagon_M2_vmpy2su_s1 IntRegs:$src1, IntRegs:$src2),
          (M2_vmpy2su_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
 def: Pat<(int_hexagon_M2_vmpy2su_s0 IntRegs:$src1, IntRegs:$src2),
          (M2_vmpy2su_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
-         (S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_nbitsclri IntRegs:$src1, u6_0ImmPred:$src2),
-         (C4_nbitsclri IntRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2),
-         (S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
-         (S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+         (S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_nbitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2),
+         (C4_nbitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
+         (S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+         (S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
 
 // V55 Scalar Instructions.
 
@@ -1689,30 +1689,30 @@ def: Pat<(int_hexagon_A5_ACS DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src
 
 // V60 Scalar Instructions.
 
-def: Pat<(int_hexagon_S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
-         (S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
-         (S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred:$src2),
-         (S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
-         (S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
-         (S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r IntRegs:$src1, u5_0ImmPred:$src2),
-         (S6_rol_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
-         (S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
-         (S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+         (S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+         (S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+         (S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>,
Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2), + (S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2), + (S6_rol_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>; // V62 Scalar Instructions. @@ -1744,8 +1744,8 @@ def: Pat<(int_hexagon_F2_dfadd DoubleRegs:$src1, DoubleRegs:$src2), (F2_dfadd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV66]>; def: Pat<(int_hexagon_M2_mnaci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mnaci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV66]>; -def: Pat<(int_hexagon_S2_mask u5_0ImmPred:$src1, u5_0ImmPred:$src2), - (S2_mask u5_0ImmPred:$src1, u5_0ImmPred:$src2)>, Requires<[HasV66]>; +def: Pat<(int_hexagon_S2_mask u5_0ImmPred_timm:$src1, u5_0ImmPred_timm:$src2), + (S2_mask u5_0ImmPred_timm:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV66]>; // V60 HVX Instructions. 
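[Annotation] Every hunk in this file applies one mechanical substitution, to the scalar patterns above and the HVX patterns below alike: each immediate PatLeaf gains a _timm suffix. In SelectionDAG pattern matching, imm matches an ISD::Constant node while timm matches an ISD::TargetConstant; immediate intrinsic operands are built as TargetConstant nodes, which a plain imm leaf does not match, and that is the usual reason for this substitution. A sketch of the resulting leaf pair, equivalent to what the ImmOpPred multiclass later in this diff expands to for u5_0ImmPred:

    // imm matches ISD::Constant; timm matches ISD::TargetConstant.
    // Immediate intrinsic operands are emitted as TargetConstant nodes,
    // so the intrinsic selection patterns must use the _timm leaves.
    def u5_0ImmPred      : PatLeaf<(i32 imm),
      [{ return isShiftedUInt<5, 0>(N->getSExtValue()); }]>;
    def u5_0ImmPred_timm : PatLeaf<(i32 timm),
      [{ return isShiftedUInt<5, 0>(N->getSExtValue()); }]>;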
@@ -1773,10 +1773,10 @@ def: Pat<(int_hexagon_V6_vaddh_dv HvxWR:$src1, HvxWR:$src2), (V6_vaddh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vaddh_dv_128B HvxWR:$src1, HvxWR:$src2), (V6_vaddh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3), - (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vrmpybusi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3), - (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3), + (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpybusi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3), + (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>; def: Pat<(int_hexagon_V6_vshufoh HvxVR:$src1, HvxVR:$src2), (V6_vshufoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vshufoh_128B HvxVR:$src1, HvxVR:$src2), @@ -1789,10 +1789,10 @@ def: Pat<(int_hexagon_V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2), (V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vdmpyhsuisat_128B HvxWR:$src1, IntRegs:$src2), (V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4), - (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vrsadubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4), - (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4), + (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrsadubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4), + (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX128B]>; def: Pat<(int_hexagon_V6_vnavgw HvxVR:$src1, HvxVR:$src2), (V6_vnavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vnavgw_128B HvxVR:$src1, HvxVR:$src2), @@ -2369,10 +2369,10 @@ def: Pat<(int_hexagon_V6_vsubhsat HvxVR:$src1, HvxVR:$src2), (V6_vsubhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vsubhsat_128B HvxVR:$src1, HvxVR:$src2), (V6_vsubhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4), - (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vrmpyubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4), - (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4), + (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, 
Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpyubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4), + (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX128B]>; def: Pat<(int_hexagon_V6_vabsw HvxVR:$src1), (V6_vabsw HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vabsw_128B HvxVR:$src1), @@ -2489,10 +2489,10 @@ def: Pat<(int_hexagon_V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), (V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vmpybv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), (V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3), - (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vrsadubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3), - (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3), + (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrsadubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3), + (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>; def: Pat<(int_hexagon_V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), (V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vdmpyhb_dv_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), @@ -2677,10 +2677,10 @@ def: Pat<(int_hexagon_V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), (V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vaddbnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), (V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), - (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vlalignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), - (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3), + (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlalignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3), + (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>; def: Pat<(int_hexagon_V6_vsatwh HvxVR:$src1, HvxVR:$src2), (V6_vsatwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vsatwh_128B HvxVR:$src1, HvxVR:$src2), @@ -2721,10 +2721,10 @@ def: Pat<(int_hexagon_V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), (V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_veqh_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), (V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; -def: Pat<(int_hexagon_V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), - (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, 
UseHVX64B]>; -def: Pat<(int_hexagon_V6_valignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), - (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3), + (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_valignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3), + (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>; def: Pat<(int_hexagon_V6_vaddwsat HvxVR:$src1, HvxVR:$src2), (V6_vaddwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vaddwsat_128B HvxVR:$src1, HvxVR:$src2), @@ -2885,10 +2885,10 @@ def: Pat<(int_hexagon_V6_vsubh HvxVR:$src1, HvxVR:$src2), (V6_vsubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vsubh_128B HvxVR:$src1, HvxVR:$src2), (V6_vsubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3), - (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vrmpyubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3), - (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3), + (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpyubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3), + (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>; def: Pat<(int_hexagon_V6_vminw HvxVR:$src1, HvxVR:$src2), (V6_vminw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vminw_128B HvxVR:$src1, HvxVR:$src2), @@ -2929,10 +2929,10 @@ def: Pat<(int_hexagon_V6_vsubuhw HvxVR:$src1, HvxVR:$src2), (V6_vsubuhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vsubuhw_128B HvxVR:$src1, HvxVR:$src2), (V6_vsubuhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4), - (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vrmpybusi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4), - (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4), + (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpybusi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4), + (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX128B]>; def: Pat<(int_hexagon_V6_vasrw HvxVR:$src1, IntRegs:$src2), (V6_vasrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vasrw_128B HvxVR:$src1, IntRegs:$src2), @@ -3016,10 +3016,10 @@ def: Pat<(int_hexagon_V6_vlsrb HvxVR:$src1, IntRegs:$src2), (V6_vlsrb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; def: Pat<(int_hexagon_V6_vlsrb_128B 
HvxVR:$src1, IntRegs:$src2), (V6_vlsrb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), - (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vlutvwhi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), - (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3), + (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlutvwhi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3), + (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX128B]>; def: Pat<(int_hexagon_V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2), (V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; def: Pat<(int_hexagon_V6_vaddububb_sat_128B HvxVR:$src1, HvxVR:$src2), @@ -3032,10 +3032,10 @@ def: Pat<(int_hexagon_V6_ldtp0 PredRegs:$src1, IntRegs:$src2), (V6_ldtp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; def: Pat<(int_hexagon_V6_ldtp0_128B PredRegs:$src1, IntRegs:$src2), (V6_ldtp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4), - (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vlutvvb_oracci_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4), - (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4), + (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlutvvb_oracci_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4), + (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX128B]>; def: Pat<(int_hexagon_V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2), (V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>; def: Pat<(int_hexagon_V6_vsubuwsat_dv_128B HvxWR:$src1, HvxWR:$src2), @@ -3124,10 +3124,10 @@ def: Pat<(int_hexagon_V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$sr (V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>; def: Pat<(int_hexagon_V6_vasrwuhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), - (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vlutvvbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), - (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3), + (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlutvvbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3), + (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX128B]>; 
def: Pat<(int_hexagon_V6_vsubuwsat HvxVR:$src1, HvxVR:$src2), (V6_vsubuwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; def: Pat<(int_hexagon_V6_vsubuwsat_128B HvxVR:$src1, HvxVR:$src2), @@ -3188,10 +3188,10 @@ def: Pat<(int_hexagon_V6_ldcnp0 PredRegs:$src1, IntRegs:$src2), (V6_ldcnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; def: Pat<(int_hexagon_V6_ldcnp0_128B PredRegs:$src1, IntRegs:$src2), (V6_ldcnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4), - (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vlutvwh_oracci_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4), - (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4), + (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlutvwh_oracci_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4), + (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX128B]>; def: Pat<(int_hexagon_V6_vsubbsat HvxVR:$src1, HvxVR:$src2), (V6_vsubbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; def: Pat<(int_hexagon_V6_vsubbsat_128B HvxVR:$src1, HvxVR:$src2), diff --git a/lib/Target/Hexagon/HexagonDepOperands.td b/lib/Target/Hexagon/HexagonDepOperands.td index fdba7b971258..8a94d96522cc 100644 --- a/lib/Target/Hexagon/HexagonDepOperands.td +++ b/lib/Target/Hexagon/HexagonDepOperands.td @@ -8,120 +8,125 @@ // Automatically generated file, please consult code owner before editing. 
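[Annotation] The hunk below converts every immediate predicate in this generated file from a single def to a defm of a new ImmOpPred multiclass, so that each predicate is emitted in both an imm and a timm flavor. Taking the first conversion in the hunk as the worked example, the line

    defm s4_0ImmPred : ImmOpPred<[{ return isShiftedInt<4, 0>(N->getSExtValue());}]>;

now defines two PatLeafs, s4_0ImmPred (matching (i32 imm)) and s4_0ImmPred_timm (matching (i32 timm)), which is what lets the intrinsic patterns above switch to the _timm names without duplicating any of the range checks.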
//===----------------------------------------------------------------------===// +multiclass ImmOpPred<code pred, ValueType vt = i32> { + def "" : PatLeaf<(vt imm), pred>; + def _timm : PatLeaf<(vt timm), pred>; +} + def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; let RenderMethod = "addSignedImmOperands"; } def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand; let DecoderMethod = "s4_0ImmDecoder"; } -def s4_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 0>(N->getSExtValue());}]>; +defm s4_0ImmPred : ImmOpPred<[{ return isShiftedInt<4, 0>(N->getSExtValue());}]>; def s29_3ImmOperand : AsmOperandClass { let Name = "s29_3Imm"; let RenderMethod = "addSignedImmOperands"; } def s29_3Imm : Operand<i32> { let ParserMatchClass = s29_3ImmOperand; let DecoderMethod = "s29_3ImmDecoder"; } -def s29_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 3>(N->getSExtValue());}]>; +defm s29_3ImmPred : ImmOpPred<[{ return isShiftedInt<32, 3>(N->getSExtValue());}]>; def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; let RenderMethod = "addImmOperands"; } def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u6_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 0>(N->getSExtValue());}]>; +defm u6_0ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 0>(N->getSExtValue());}]>; def a30_2ImmOperand : AsmOperandClass { let Name = "a30_2Imm"; let RenderMethod = "addSignedImmOperands"; } def a30_2Imm : Operand<i32> { let ParserMatchClass = a30_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; } -def a30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 2>(N->getSExtValue());}]>; +defm a30_2ImmPred : ImmOpPred<[{ return isShiftedInt<32, 2>(N->getSExtValue());}]>; def u29_3ImmOperand : AsmOperandClass { let Name = "u29_3Imm"; let RenderMethod = "addImmOperands"; } def u29_3Imm : Operand<i32> { let ParserMatchClass = u29_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u29_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 3>(N->getSExtValue());}]>; +defm u29_3ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 3>(N->getSExtValue());}]>; def s8_0ImmOperand : AsmOperandClass { let Name = "s8_0Imm"; let RenderMethod = "addSignedImmOperands"; } def s8_0Imm : Operand<i32> { let ParserMatchClass = s8_0ImmOperand; let DecoderMethod = "s8_0ImmDecoder"; } -def s8_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<8, 0>(N->getSExtValue());}]>; +defm s8_0ImmPred : ImmOpPred<[{ return isShiftedInt<8, 0>(N->getSExtValue());}]>; def u32_0ImmOperand : AsmOperandClass { let Name = "u32_0Imm"; let RenderMethod = "addImmOperands"; } def u32_0Imm : Operand<i32> { let ParserMatchClass = u32_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u32_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 0>(N->getSExtValue());}]>; +defm u32_0ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 0>(N->getSExtValue());}]>; def u4_2ImmOperand : AsmOperandClass { let Name = "u4_2Imm"; let RenderMethod = "addImmOperands"; } def u4_2Imm : Operand<i32> { let ParserMatchClass = u4_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u4_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<4, 2>(N->getSExtValue());}]>; +defm u4_2ImmPred : ImmOpPred<[{ return isShiftedUInt<4, 2>(N->getSExtValue());}]>; def u3_0ImmOperand : AsmOperandClass { let Name = "u3_0Imm"; let RenderMethod = "addImmOperands"; } def u3_0Imm : Operand<i32> { let ParserMatchClass = 
u3_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u3_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<3, 0>(N->getSExtValue());}]>; +defm u3_0ImmPred : ImmOpPred<[{ return isShiftedUInt<3, 0>(N->getSExtValue());}]>; def b15_2ImmOperand : AsmOperandClass { let Name = "b15_2Imm"; let RenderMethod = "addSignedImmOperands"; } def b15_2Imm : Operand<OtherVT> { let ParserMatchClass = b15_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; } -def b15_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<15, 2>(N->getSExtValue());}]>; +defm b15_2ImmPred : ImmOpPred<[{ return isShiftedInt<15, 2>(N->getSExtValue());}]>; def u11_3ImmOperand : AsmOperandClass { let Name = "u11_3Imm"; let RenderMethod = "addImmOperands"; } def u11_3Imm : Operand<i32> { let ParserMatchClass = u11_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u11_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<11, 3>(N->getSExtValue());}]>; +defm u11_3ImmPred : ImmOpPred<[{ return isShiftedUInt<11, 3>(N->getSExtValue());}]>; def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; let RenderMethod = "addSignedImmOperands"; } def s4_3Imm : Operand<i32> { let ParserMatchClass = s4_3ImmOperand; let DecoderMethod = "s4_3ImmDecoder"; } -def s4_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 3>(N->getSExtValue());}]>; +defm s4_3ImmPred : ImmOpPred<[{ return isShiftedInt<4, 3>(N->getSExtValue());}]>; def m32_0ImmOperand : AsmOperandClass { let Name = "m32_0Imm"; let RenderMethod = "addImmOperands"; } def m32_0Imm : Operand<i32> { let ParserMatchClass = m32_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def m32_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 0>(N->getSExtValue());}]>; +defm m32_0ImmPred : ImmOpPred<[{ return isShiftedInt<32, 0>(N->getSExtValue());}]>; def u3_1ImmOperand : AsmOperandClass { let Name = "u3_1Imm"; let RenderMethod = "addImmOperands"; } def u3_1Imm : Operand<i32> { let ParserMatchClass = u3_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u3_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<3, 1>(N->getSExtValue());}]>; +defm u3_1ImmPred : ImmOpPred<[{ return isShiftedUInt<3, 1>(N->getSExtValue());}]>; def u1_0ImmOperand : AsmOperandClass { let Name = "u1_0Imm"; let RenderMethod = "addImmOperands"; } def u1_0Imm : Operand<i32> { let ParserMatchClass = u1_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u1_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<1, 0>(N->getSExtValue());}]>; +defm u1_0ImmPred : ImmOpPred<[{ return isShiftedUInt<1, 0>(N->getSExtValue());}]>; def s31_1ImmOperand : AsmOperandClass { let Name = "s31_1Imm"; let RenderMethod = "addSignedImmOperands"; } def s31_1Imm : Operand<i32> { let ParserMatchClass = s31_1ImmOperand; let DecoderMethod = "s31_1ImmDecoder"; } -def s31_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 1>(N->getSExtValue());}]>; +defm s31_1ImmPred : ImmOpPred<[{ return isShiftedInt<32, 1>(N->getSExtValue());}]>; def s3_0ImmOperand : AsmOperandClass { let Name = "s3_0Imm"; let RenderMethod = "addSignedImmOperands"; } def s3_0Imm : Operand<i32> { let ParserMatchClass = s3_0ImmOperand; let DecoderMethod = "s3_0ImmDecoder"; } -def s3_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<3, 0>(N->getSExtValue());}]>; +defm s3_0ImmPred : ImmOpPred<[{ return isShiftedInt<3, 0>(N->getSExtValue());}]>; def s30_2ImmOperand : AsmOperandClass { let Name = "s30_2Imm"; let RenderMethod = "addSignedImmOperands"; } def s30_2Imm : Operand<i32> { let 
ParserMatchClass = s30_2ImmOperand; let DecoderMethod = "s30_2ImmDecoder"; } -def s30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 2>(N->getSExtValue());}]>; +defm s30_2ImmPred : ImmOpPred<[{ return isShiftedInt<32, 2>(N->getSExtValue());}]>; def u4_0ImmOperand : AsmOperandClass { let Name = "u4_0Imm"; let RenderMethod = "addImmOperands"; } def u4_0Imm : Operand<i32> { let ParserMatchClass = u4_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u4_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<4, 0>(N->getSExtValue());}]>; +defm u4_0ImmPred : ImmOpPred<[{ return isShiftedUInt<4, 0>(N->getSExtValue());}]>; def s6_0ImmOperand : AsmOperandClass { let Name = "s6_0Imm"; let RenderMethod = "addSignedImmOperands"; } def s6_0Imm : Operand<i32> { let ParserMatchClass = s6_0ImmOperand; let DecoderMethod = "s6_0ImmDecoder"; } -def s6_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<6, 0>(N->getSExtValue());}]>; +defm s6_0ImmPred : ImmOpPred<[{ return isShiftedInt<6, 0>(N->getSExtValue());}]>; def u5_3ImmOperand : AsmOperandClass { let Name = "u5_3Imm"; let RenderMethod = "addImmOperands"; } def u5_3Imm : Operand<i32> { let ParserMatchClass = u5_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u5_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<5, 3>(N->getSExtValue());}]>; +defm u5_3ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 3>(N->getSExtValue());}]>; def s32_0ImmOperand : AsmOperandClass { let Name = "s32_0Imm"; let RenderMethod = "addSignedImmOperands"; } def s32_0Imm : Operand<i32> { let ParserMatchClass = s32_0ImmOperand; let DecoderMethod = "s32_0ImmDecoder"; } -def s32_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 0>(N->getSExtValue());}]>; +defm s32_0ImmPred : ImmOpPred<[{ return isShiftedInt<32, 0>(N->getSExtValue());}]>; def s6_3ImmOperand : AsmOperandClass { let Name = "s6_3Imm"; let RenderMethod = "addSignedImmOperands"; } def s6_3Imm : Operand<i32> { let ParserMatchClass = s6_3ImmOperand; let DecoderMethod = "s6_3ImmDecoder"; } -def s6_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<6, 3>(N->getSExtValue());}]>; +defm s6_3ImmPred : ImmOpPred<[{ return isShiftedInt<6, 3>(N->getSExtValue());}]>; def u10_0ImmOperand : AsmOperandClass { let Name = "u10_0Imm"; let RenderMethod = "addImmOperands"; } def u10_0Imm : Operand<i32> { let ParserMatchClass = u10_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u10_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<10, 0>(N->getSExtValue());}]>; +defm u10_0ImmPred : ImmOpPred<[{ return isShiftedUInt<10, 0>(N->getSExtValue());}]>; def u31_1ImmOperand : AsmOperandClass { let Name = "u31_1Imm"; let RenderMethod = "addImmOperands"; } def u31_1Imm : Operand<i32> { let ParserMatchClass = u31_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u31_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 1>(N->getSExtValue());}]>; +defm u31_1ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 1>(N->getSExtValue());}]>; def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; let RenderMethod = "addSignedImmOperands"; } def s4_1Imm : Operand<i32> { let ParserMatchClass = s4_1ImmOperand; let DecoderMethod = "s4_1ImmDecoder"; } -def s4_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 1>(N->getSExtValue());}]>; +defm s4_1ImmPred : ImmOpPred<[{ return isShiftedInt<4, 1>(N->getSExtValue());}]>; def u16_0ImmOperand : AsmOperandClass { let Name = "u16_0Imm"; let RenderMethod = "addImmOperands"; } def u16_0Imm : Operand<i32> { let ParserMatchClass = 
u16_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u16_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<16, 0>(N->getSExtValue());}]>; +defm u16_0ImmPred : ImmOpPred<[{ return isShiftedUInt<16, 0>(N->getSExtValue());}]>; def u6_1ImmOperand : AsmOperandClass { let Name = "u6_1Imm"; let RenderMethod = "addImmOperands"; } def u6_1Imm : Operand<i32> { let ParserMatchClass = u6_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u6_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 1>(N->getSExtValue());}]>; +defm u6_1ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 1>(N->getSExtValue());}]>; def u5_2ImmOperand : AsmOperandClass { let Name = "u5_2Imm"; let RenderMethod = "addImmOperands"; } def u5_2Imm : Operand<i32> { let ParserMatchClass = u5_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u5_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<5, 2>(N->getSExtValue());}]>; +defm u5_2ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 2>(N->getSExtValue());}]>; def u26_6ImmOperand : AsmOperandClass { let Name = "u26_6Imm"; let RenderMethod = "addImmOperands"; } def u26_6Imm : Operand<i32> { let ParserMatchClass = u26_6ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u26_6ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<26, 6>(N->getSExtValue());}]>; +defm u26_6ImmPred : ImmOpPred<[{ return isShiftedUInt<26, 6>(N->getSExtValue());}]>; def u6_2ImmOperand : AsmOperandClass { let Name = "u6_2Imm"; let RenderMethod = "addImmOperands"; } def u6_2Imm : Operand<i32> { let ParserMatchClass = u6_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u6_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 2>(N->getSExtValue());}]>; +defm u6_2ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 2>(N->getSExtValue());}]>; def u7_0ImmOperand : AsmOperandClass { let Name = "u7_0Imm"; let RenderMethod = "addImmOperands"; } def u7_0Imm : Operand<i32> { let ParserMatchClass = u7_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u7_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<7, 0>(N->getSExtValue());}]>; +defm u7_0ImmPred : ImmOpPred<[{ return isShiftedUInt<7, 0>(N->getSExtValue());}]>; def b13_2ImmOperand : AsmOperandClass { let Name = "b13_2Imm"; let RenderMethod = "addSignedImmOperands"; } def b13_2Imm : Operand<OtherVT> { let ParserMatchClass = b13_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; } -def b13_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<13, 2>(N->getSExtValue());}]>; +defm b13_2ImmPred : ImmOpPred<[{ return isShiftedInt<13, 2>(N->getSExtValue());}]>; def u5_0ImmOperand : AsmOperandClass { let Name = "u5_0Imm"; let RenderMethod = "addImmOperands"; } def u5_0Imm : Operand<i32> { let ParserMatchClass = u5_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u5_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<5, 0>(N->getSExtValue());}]>; +defm u5_0ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 0>(N->getSExtValue());}]>; def u2_0ImmOperand : AsmOperandClass { let Name = "u2_0Imm"; let RenderMethod = "addImmOperands"; } def u2_0Imm : Operand<i32> { let ParserMatchClass = u2_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u2_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<2, 0>(N->getSExtValue());}]>; +defm u2_0ImmPred : ImmOpPred<[{ return isShiftedUInt<2, 0>(N->getSExtValue());}]>; def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; let RenderMethod = "addSignedImmOperands"; } def s4_2Imm : Operand<i32> { let 
ParserMatchClass = s4_2ImmOperand; let DecoderMethod = "s4_2ImmDecoder"; } -def s4_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 2>(N->getSExtValue());}]>; +defm s4_2ImmPred : ImmOpPred<[{ return isShiftedInt<4, 2>(N->getSExtValue());}]>; def b30_2ImmOperand : AsmOperandClass { let Name = "b30_2Imm"; let RenderMethod = "addSignedImmOperands"; } def b30_2Imm : Operand<OtherVT> { let ParserMatchClass = b30_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; } -def b30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 2>(N->getSExtValue());}]>; +defm b30_2ImmPred : ImmOpPred<[{ return isShiftedInt<32, 2>(N->getSExtValue());}]>; def u8_0ImmOperand : AsmOperandClass { let Name = "u8_0Imm"; let RenderMethod = "addImmOperands"; } def u8_0Imm : Operand<i32> { let ParserMatchClass = u8_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u8_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<8, 0>(N->getSExtValue());}]>; +defm u8_0ImmPred : ImmOpPred<[{ return isShiftedUInt<8, 0>(N->getSExtValue());}]>; def u30_2ImmOperand : AsmOperandClass { let Name = "u30_2Imm"; let RenderMethod = "addImmOperands"; } def u30_2Imm : Operand<i32> { let ParserMatchClass = u30_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 2>(N->getSExtValue());}]>; +defm u30_2ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 2>(N->getSExtValue());}]>; diff --git a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp index c1f32e54e98d..0844fb8a8629 100644 --- a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp +++ b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp @@ -250,7 +250,7 @@ bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B, unsigned Opc = T1I->getOpcode(); if (Opc != Hexagon::J2_jumpt && Opc != Hexagon::J2_jumpf) return false; - unsigned PredR = T1I->getOperand(0).getReg(); + Register PredR = T1I->getOperand(0).getReg(); // Get the layout successor, or 0 if B does not have one. MachineFunction::iterator NextBI = std::next(MachineFunction::iterator(B)); @@ -384,8 +384,8 @@ bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B) for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isDef()) continue; - unsigned R = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = MO.getReg(); + if (!Register::isVirtualRegister(R)) continue; if (!isPredicate(R)) continue; @@ -401,8 +401,8 @@ bool HexagonEarlyIfConversion::usesUndefVReg(const MachineInstr *MI) const { for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isUse()) continue; - unsigned R = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = MO.getReg(); + if (!Register::isVirtualRegister(R)) continue; const MachineInstr *DefI = MRI->getVRegDef(R); // "Undefined" virtual registers are actually defined via IMPLICIT_DEF. 
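[Annotation] From this point on, most of the C++ churn in the diff is a single mechanical migration: raw unsigned register numbers become the llvm::Register wrapper, and the static classification helpers move from TargetRegisterInfo to Register (Register::isVirtualRegister, Register::isPhysicalRegister, Register::virtReg2Index, and so on). A simplified sketch of the wrapper, assuming the llvm/CodeGen/Register.h interface of this period rather than quoting the verbatim header:

    // Sketch: Register wraps the raw unsigned register number and hosts
    // the predicates that previously lived on TargetRegisterInfo.
    class Register {
      unsigned Reg;
    public:
      constexpr Register(unsigned Val = 0) : Reg(Val) {}
      // Virtual registers occupy the top half of the 32-bit number
      // space, so classification is a sign test on the raw value.
      static bool isVirtualRegister(unsigned Reg) { return int(Reg) < 0; }
      static unsigned virtReg2Index(unsigned Reg) { return Reg & ~(1u << 31); }
      static unsigned index2VirtReg(unsigned Index) { return Index | (1u << 31); }
      operator unsigned() const { return Reg; }
    };

Because Register converts implicitly to and from unsigned, the replacements in these hunks are behavior-preserving; they only move the code onto the type-safe API.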
@@ -437,7 +437,7 @@ bool HexagonEarlyIfConversion::isValid(const FlowPattern &FP) const { break; if (usesUndefVReg(&MI)) return false; - unsigned DefR = MI.getOperand(0).getReg(); + Register DefR = MI.getOperand(0).getReg(); if (isPredicate(DefR)) return false; } @@ -491,8 +491,8 @@ unsigned HexagonEarlyIfConversion::countPredicateDefs( for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isDef()) continue; - unsigned R = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = MO.getReg(); + if (!Register::isVirtualRegister(R)) continue; if (isPredicate(R)) PredDefs++; @@ -798,7 +798,7 @@ unsigned HexagonEarlyIfConversion::buildMux(MachineBasicBlock *B, const MCInstrDesc &D = HII->get(Opc); DebugLoc DL = B->findBranchDebugLoc(); - unsigned MuxR = MRI->createVirtualRegister(DRC); + Register MuxR = MRI->createVirtualRegister(DRC); BuildMI(*B, At, DL, D, MuxR) .addReg(PredR) .addReg(TR, 0, TSR) @@ -837,7 +837,7 @@ void HexagonEarlyIfConversion::updatePhiNodes(MachineBasicBlock *WhereB, unsigned MuxR = 0, MuxSR = 0; if (TR && FR) { - unsigned DR = PN->getOperand(0).getReg(); + Register DR = PN->getOperand(0).getReg(); const TargetRegisterClass *RC = MRI->getRegClass(DR); MuxR = buildMux(FP.SplitB, FP.SplitB->getFirstTerminator(), RC, FP.PredR, TR, TSR, FR, FSR); @@ -988,8 +988,8 @@ void HexagonEarlyIfConversion::eliminatePhis(MachineBasicBlock *B) { MachineInstr *PN = &*I; assert(PN->getNumOperands() == 3 && "Invalid phi node"); MachineOperand &UO = PN->getOperand(1); - unsigned UseR = UO.getReg(), UseSR = UO.getSubReg(); - unsigned DefR = PN->getOperand(0).getReg(); + Register UseR = UO.getReg(), UseSR = UO.getSubReg(); + Register DefR = PN->getOperand(0).getReg(); unsigned NewR = UseR; if (UseSR) { // MRI.replaceVregUsesWith does not allow to update the subregister, diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp index c343e426ac7d..8984ee82960d 100644 --- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -285,7 +285,7 @@ bool HexagonExpandCondsets::isCondset(const MachineInstr &MI) { } LaneBitmask HexagonExpandCondsets::getLaneMask(unsigned Reg, unsigned Sub) { - assert(TargetRegisterInfo::isVirtualRegister(Reg)); + assert(Register::isVirtualRegister(Reg)); return Sub != 0 ? 
TRI->getSubRegIndexLaneMask(Sub) : MRI->getMaxLaneMaskForVReg(Reg); } @@ -364,7 +364,7 @@ void HexagonExpandCondsets::updateKillFlags(unsigned Reg) { void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM, LiveRange &Range) { - assert(TargetRegisterInfo::isVirtualRegister(Reg)); + assert(Register::isVirtualRegister(Reg)); if (Range.empty()) return; @@ -372,8 +372,8 @@ void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM, auto IsRegDef = [this,Reg,LM] (MachineOperand &Op) -> std::pair<bool,bool> { if (!Op.isReg() || !Op.isDef()) return { false, false }; - unsigned DR = Op.getReg(), DSR = Op.getSubReg(); - if (!TargetRegisterInfo::isVirtualRegister(DR) || DR != Reg) + Register DR = Op.getReg(), DSR = Op.getSubReg(); + if (!Register::isVirtualRegister(DR) || DR != Reg) return { false, false }; LaneBitmask SLM = getLaneMask(DR, DSR); LaneBitmask A = SLM & LM; @@ -551,8 +551,8 @@ void HexagonExpandCondsets::updateLiveness(std::set<unsigned> &RegSet, bool Recalc, bool UpdateKills, bool UpdateDeads) { UpdateKills |= UpdateDeads; for (unsigned R : RegSet) { - if (!TargetRegisterInfo::isVirtualRegister(R)) { - assert(TargetRegisterInfo::isPhysicalRegister(R)); + if (!Register::isVirtualRegister(R)) { + assert(Register::isPhysicalRegister(R)); // There shouldn't be any physical registers as operands, except // possibly reserved registers. assert(MRI->isReserved(R)); @@ -579,17 +579,17 @@ unsigned HexagonExpandCondsets::getCondTfrOpcode(const MachineOperand &SO, using namespace Hexagon; if (SO.isReg()) { - unsigned PhysR; + Register PhysR; RegisterRef RS = SO; - if (TargetRegisterInfo::isVirtualRegister(RS.Reg)) { + if (Register::isVirtualRegister(RS.Reg)) { const TargetRegisterClass *VC = MRI->getRegClass(RS.Reg); assert(VC->begin() != VC->end() && "Empty register class"); PhysR = *VC->begin(); } else { - assert(TargetRegisterInfo::isPhysicalRegister(RS.Reg)); + assert(Register::isPhysicalRegister(RS.Reg)); PhysR = RS.Reg; } - unsigned PhysS = (RS.Sub == 0) ? PhysR : TRI->getSubReg(PhysR, RS.Sub); + Register PhysS = (RS.Sub == 0) ? PhysR : TRI->getSubReg(PhysR, RS.Sub); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysS); switch (TRI->getRegSizeInBits(*RC)) { case 32: @@ -671,7 +671,7 @@ bool HexagonExpandCondsets::split(MachineInstr &MI, MachineOperand &MD = MI.getOperand(0); // Definition MachineOperand &MP = MI.getOperand(1); // Predicate register assert(MD.isDef()); - unsigned DR = MD.getReg(), DSR = MD.getSubReg(); + Register DR = MD.getReg(), DSR = MD.getSubReg(); bool ReadUndef = MD.isUndef(); MachineBasicBlock::iterator At = MI; @@ -802,7 +802,7 @@ bool HexagonExpandCondsets::canMoveOver(MachineInstr &MI, ReferenceMap &Defs, // For physical register we would need to check register aliases, etc. // and we don't want to bother with that. It would be of little value // before the actual register rewriting (from virtual to physical). - if (!TargetRegisterInfo::isVirtualRegister(RR.Reg)) + if (!Register::isVirtualRegister(RR.Reg)) return false; // No redefs for any operand. 
if (isRefInMap(RR, Defs, Exec_Then)) @@ -954,7 +954,7 @@ bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond, return false; RegisterRef RT(MS); - unsigned PredR = MP.getReg(); + Register PredR = MP.getReg(); MachineInstr *DefI = getReachingDefForPred(RT, TfrI, PredR, Cond); if (!DefI || !isPredicable(DefI)) return false; @@ -999,7 +999,7 @@ bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond, // subregisters are other physical registers, and we are not checking // that. RegisterRef RR = Op; - if (!TargetRegisterInfo::isVirtualRegister(RR.Reg)) + if (!Register::isVirtualRegister(RR.Reg)) return false; ReferenceMap &Map = Op.isDef() ? Defs : Uses; @@ -1091,7 +1091,7 @@ bool HexagonExpandCondsets::predicateInBlock(MachineBasicBlock &B, } bool HexagonExpandCondsets::isIntReg(RegisterRef RR, unsigned &BW) { - if (!TargetRegisterInfo::isVirtualRegister(RR.Reg)) + if (!Register::isVirtualRegister(RR.Reg)) return false; const TargetRegisterClass *RC = MRI->getRegClass(RR.Reg); if (RC == &Hexagon::IntRegsRegClass) { diff --git a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp index f7edc168de4a..d21de8ccb5ab 100644 --- a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp +++ b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp @@ -114,12 +114,11 @@ bool HexagonFixupHwLoops::fixupLoopInstrs(MachineFunction &MF) { // First pass - compute the offset of each basic block. for (const MachineBasicBlock &MBB : MF) { - if (MBB.getAlignment()) { + if (MBB.getAlignment() != Align::None()) { // Although we don't know the exact layout of the final code, we need // to account for alignment padding somehow. This heuristic pads each // aligned basic block according to the alignment value. - int ByteAlign = (1u << MBB.getAlignment()) - 1; - InstOffset = (InstOffset + ByteAlign) & ~(ByteAlign); + InstOffset = alignTo(InstOffset, MBB.getAlignment()); } BlockToInstOffset[&MBB] = InstOffset; diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index 3368ee4fb3b9..bfa3372d7faf 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -303,10 +303,10 @@ static bool needsStackFrame(const MachineBasicBlock &MBB, const BitVector &CSR, if (MO.isFI()) return true; if (MO.isReg()) { - unsigned R = MO.getReg(); + Register R = MO.getReg(); // Virtual registers will need scavenging, which then may require // a stack slot. - if (TargetRegisterInfo::isVirtualRegister(R)) + if (Register::isVirtualRegister(R)) return true; for (MCSubRegIterator S(R, &HRI, true); S.isValid(); ++S) if (CSR[*S]) @@ -973,8 +973,8 @@ void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB, // understand paired registers for cfi_offset. 
// Eg .cfi_offset r1:0, -64 - unsigned HiReg = HRI.getSubReg(Reg, Hexagon::isub_hi); - unsigned LoReg = HRI.getSubReg(Reg, Hexagon::isub_lo); + Register HiReg = HRI.getSubReg(Reg, Hexagon::isub_hi); + Register LoReg = HRI.getSubReg(Reg, Hexagon::isub_lo); unsigned HiDwarfReg = HRI.getDwarfRegNum(HiReg, true); unsigned LoDwarfReg = HRI.getDwarfRegNum(LoReg, true); auto OffHi = MCCFIInstruction::createOffset(FrameLabel, HiDwarfReg, @@ -1377,10 +1377,10 @@ void HexagonFrameLowering::processFunctionBeforeFrameFinalized( } MFI.setLocalFrameSize(LFS); - unsigned A = MFI.getLocalFrameMaxAlign(); + Align A = MFI.getLocalFrameMaxAlign(); assert(A <= 8 && "Unexpected local frame alignment"); - if (A == 0) - MFI.setLocalFrameMaxAlign(8); + if (A == 1) + MFI.setLocalFrameMaxAlign(Align(8)); MFI.setUseLocalStackAllocationBlock(true); // Set the physical aligned-stack base address register. @@ -1570,13 +1570,13 @@ bool HexagonFrameLowering::expandCopy(MachineBasicBlock &B, const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const { MachineInstr *MI = &*It; DebugLoc DL = MI->getDebugLoc(); - unsigned DstR = MI->getOperand(0).getReg(); - unsigned SrcR = MI->getOperand(1).getReg(); + Register DstR = MI->getOperand(0).getReg(); + Register SrcR = MI->getOperand(1).getReg(); if (!Hexagon::ModRegsRegClass.contains(DstR) || !Hexagon::ModRegsRegClass.contains(SrcR)) return false; - unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); BuildMI(B, It, DL, HII.get(TargetOpcode::COPY), TmpR).add(MI->getOperand(1)); BuildMI(B, It, DL, HII.get(TargetOpcode::COPY), DstR) .addReg(TmpR, RegState::Kill); @@ -1595,13 +1595,13 @@ bool HexagonFrameLowering::expandStoreInt(MachineBasicBlock &B, DebugLoc DL = MI->getDebugLoc(); unsigned Opc = MI->getOpcode(); - unsigned SrcR = MI->getOperand(2).getReg(); + Register SrcR = MI->getOperand(2).getReg(); bool IsKill = MI->getOperand(2).isKill(); int FI = MI->getOperand(0).getIndex(); // TmpR = C2_tfrpr SrcR if SrcR is a predicate register // TmpR = A2_tfrcrr SrcR if SrcR is a modifier register - unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); unsigned TfrOpc = (Opc == Hexagon::STriw_pred) ? 
Hexagon::C2_tfrpr : Hexagon::A2_tfrcrr; BuildMI(B, It, DL, HII.get(TfrOpc), TmpR) @@ -1628,11 +1628,11 @@ bool HexagonFrameLowering::expandLoadInt(MachineBasicBlock &B, DebugLoc DL = MI->getDebugLoc(); unsigned Opc = MI->getOpcode(); - unsigned DstR = MI->getOperand(0).getReg(); + Register DstR = MI->getOperand(0).getReg(); int FI = MI->getOperand(1).getIndex(); // TmpR = L2_loadri_io FI, 0 - unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); BuildMI(B, It, DL, HII.get(Hexagon::L2_loadri_io), TmpR) .addFrameIndex(FI) .addImm(0) @@ -1658,7 +1658,7 @@ bool HexagonFrameLowering::expandStoreVecPred(MachineBasicBlock &B, return false; DebugLoc DL = MI->getDebugLoc(); - unsigned SrcR = MI->getOperand(2).getReg(); + Register SrcR = MI->getOperand(2).getReg(); bool IsKill = MI->getOperand(2).isKill(); int FI = MI->getOperand(0).getIndex(); auto *RC = &Hexagon::HvxVRRegClass; @@ -1667,8 +1667,8 @@ bool HexagonFrameLowering::expandStoreVecPred(MachineBasicBlock &B, // TmpR0 = A2_tfrsi 0x01010101 // TmpR1 = V6_vandqrt Qx, TmpR0 // store FI, 0, TmpR1 - unsigned TmpR0 = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); - unsigned TmpR1 = MRI.createVirtualRegister(RC); + Register TmpR0 = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register TmpR1 = MRI.createVirtualRegister(RC); BuildMI(B, It, DL, HII.get(Hexagon::A2_tfrsi), TmpR0) .addImm(0x01010101); @@ -1695,15 +1695,15 @@ bool HexagonFrameLowering::expandLoadVecPred(MachineBasicBlock &B, return false; DebugLoc DL = MI->getDebugLoc(); - unsigned DstR = MI->getOperand(0).getReg(); + Register DstR = MI->getOperand(0).getReg(); int FI = MI->getOperand(1).getIndex(); auto *RC = &Hexagon::HvxVRRegClass; // TmpR0 = A2_tfrsi 0x01010101 // TmpR1 = load FI, 0 // DstR = V6_vandvrt TmpR1, TmpR0 - unsigned TmpR0 = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); - unsigned TmpR1 = MRI.createVirtualRegister(RC); + Register TmpR0 = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register TmpR1 = MRI.createVirtualRegister(RC); BuildMI(B, It, DL, HII.get(Hexagon::A2_tfrsi), TmpR0) .addImm(0x01010101); @@ -1745,9 +1745,9 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B, } DebugLoc DL = MI->getDebugLoc(); - unsigned SrcR = MI->getOperand(2).getReg(); - unsigned SrcLo = HRI.getSubReg(SrcR, Hexagon::vsub_lo); - unsigned SrcHi = HRI.getSubReg(SrcR, Hexagon::vsub_hi); + Register SrcR = MI->getOperand(2).getReg(); + Register SrcLo = HRI.getSubReg(SrcR, Hexagon::vsub_lo); + Register SrcHi = HRI.getSubReg(SrcR, Hexagon::vsub_hi); bool IsKill = MI->getOperand(2).isKill(); int FI = MI->getOperand(0).getIndex(); @@ -1793,9 +1793,9 @@ bool HexagonFrameLowering::expandLoadVec2(MachineBasicBlock &B, return false; DebugLoc DL = MI->getDebugLoc(); - unsigned DstR = MI->getOperand(0).getReg(); - unsigned DstHi = HRI.getSubReg(DstR, Hexagon::vsub_hi); - unsigned DstLo = HRI.getSubReg(DstR, Hexagon::vsub_lo); + Register DstR = MI->getOperand(0).getReg(); + Register DstHi = HRI.getSubReg(DstR, Hexagon::vsub_hi); + Register DstLo = HRI.getSubReg(DstR, Hexagon::vsub_lo); int FI = MI->getOperand(1).getIndex(); unsigned Size = HRI.getSpillSize(Hexagon::HvxVRRegClass); @@ -1834,7 +1834,7 @@ bool HexagonFrameLowering::expandStoreVec(MachineBasicBlock &B, auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); DebugLoc DL = MI->getDebugLoc(); - unsigned SrcR = MI->getOperand(2).getReg(); + Register SrcR = MI->getOperand(2).getReg(); bool IsKill = 
MI->getOperand(2).isKill(); int FI = MI->getOperand(0).getIndex(); @@ -1863,7 +1863,7 @@ bool HexagonFrameLowering::expandLoadVec(MachineBasicBlock &B, auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); DebugLoc DL = MI->getDebugLoc(); - unsigned DstR = MI->getOperand(0).getReg(); + Register DstR = MI->getOperand(0).getReg(); int FI = MI->getOperand(1).getIndex(); unsigned NeedAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass); @@ -2299,7 +2299,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF, int TFI; if (!HII.isLoadFromStackSlot(MI, TFI) || TFI != FI) continue; - unsigned DstR = MI.getOperand(0).getReg(); + Register DstR = MI.getOperand(0).getReg(); assert(MI.getOperand(0).getSubReg() == 0); MachineInstr *CopyOut = nullptr; if (DstR != FoundR) { diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h index 65e8c7686640..27265dd53794 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/lib/Target/Hexagon/HexagonFrameLowering.h @@ -30,7 +30,7 @@ class TargetRegisterClass; class HexagonFrameLowering : public TargetFrameLowering { public: explicit HexagonFrameLowering() - : TargetFrameLowering(StackGrowsDown, 8, 0, 1, true) {} + : TargetFrameLowering(StackGrowsDown, Align(8), 0, Align::None(), true) {} // All of the prolog/epilog functionality, including saving and restoring // callee-saved registers is handled in emitPrologue. This is to have the diff --git a/lib/Target/Hexagon/HexagonGenExtract.cpp b/lib/Target/Hexagon/HexagonGenExtract.cpp index 3417c74e359b..caa0e4d80397 100644 --- a/lib/Target/Hexagon/HexagonGenExtract.cpp +++ b/lib/Target/Hexagon/HexagonGenExtract.cpp @@ -184,7 +184,7 @@ bool HexagonGenExtract::convert(Instruction *In) { // The width of the extracted field is the minimum of the original bits // that remain after the shifts and the number of contiguous 1s in the mask. uint32_t W = std::min(U, T); - if (W == 0) + if (W == 0 || W == 1) return false; // Check if the extracted bits are contained within the mask that it is diff --git a/lib/Target/Hexagon/HexagonGenInsert.cpp b/lib/Target/Hexagon/HexagonGenInsert.cpp index 81025c1c5325..48881e02f4d3 100644 --- a/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -163,11 +163,11 @@ namespace { } static inline unsigned v2x(unsigned v) { - return TargetRegisterInfo::virtReg2Index(v); + return Register::virtReg2Index(v); } static inline unsigned x2v(unsigned x) { - return TargetRegisterInfo::index2VirtReg(x); + return Register::index2VirtReg(x); } }; @@ -267,7 +267,7 @@ namespace { CellMapShadow(const BitTracker &T) : BT(T) {} const BitTracker::RegisterCell &lookup(unsigned VR) { - unsigned RInd = TargetRegisterInfo::virtReg2Index(VR); + unsigned RInd = Register::virtReg2Index(VR); // Grow the vector to at least 32 elements. 
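[Annotation] Interleaved with the Register migration, three hunks above move raw alignment arithmetic onto the new llvm::Align type: HexagonFixupHwLoops replaces the hand-written (1u << MBB.getAlignment()) - 1 masking with alignTo, HexagonFrameLowering::processFunctionBeforeFrameFinalized now receives an Align from getLocalFrameMaxAlign() (so the no-alignment test becomes A == 1 rather than A == 0, since Align::None() is the 1-byte alignment), and the HexagonFrameLowering constructor passes Align(8) and Align::None() to TargetFrameLowering. A minimal sketch of the idiom, assuming the llvm/Support/Alignment.h interface of this period; padToAlignment is a hypothetical helper for illustration:

    #include "llvm/Support/Alignment.h"
    #include <cstdint>

    // Hypothetical helper: rounds Offset up to the next multiple of the
    // alignment, replacing the old (1u << Log2Align) - 1 mask arithmetic.
    uint64_t padToAlignment(uint64_t Offset, llvm::Align A) {
      // llvm::alignTo(uint64_t, Align) does the rounding; Align models a
      // power-of-two byte alignment, and Align::None() == Align(1).
      return llvm::alignTo(Offset, A);
    }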
if (RInd >= CVect.size()) CVect.resize(std::max(RInd+16, 32U), nullptr); @@ -606,9 +606,9 @@ void HexagonGenInsert::buildOrderingMF(RegisterOrdering &RO) const { for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { const MachineOperand &MO = MI->getOperand(i); if (MO.isReg() && MO.isDef()) { - unsigned R = MO.getReg(); + Register R = MO.getReg(); assert(MO.getSubReg() == 0 && "Unexpected subregister in definition"); - if (TargetRegisterInfo::isVirtualRegister(R)) + if (Register::isVirtualRegister(R)) RO.insert(std::make_pair(R, Index++)); } } @@ -724,8 +724,8 @@ void HexagonGenInsert::getInstrDefs(const MachineInstr *MI, const MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || !MO.isDef()) continue; - unsigned R = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = MO.getReg(); + if (!Register::isVirtualRegister(R)) continue; Defs.insert(R); } @@ -737,8 +737,8 @@ void HexagonGenInsert::getInstrUses(const MachineInstr *MI, const MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || !MO.isUse()) continue; - unsigned R = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = MO.getReg(); + if (!Register::isVirtualRegister(R)) continue; Uses.insert(R); } @@ -1399,7 +1399,7 @@ bool HexagonGenInsert::generateInserts() { for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) { unsigned VR = I->first; const TargetRegisterClass *RC = MRI->getRegClass(VR); - unsigned NewVR = MRI->createVirtualRegister(RC); + Register NewVR = MRI->createVirtualRegister(RC); RegMap[VR] = NewVR; } @@ -1477,9 +1477,8 @@ bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) { for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isDef()) continue; - unsigned R = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R) || - !MRI->use_nodbg_empty(R)) { + Register R = MO.getReg(); + if (!Register::isVirtualRegister(R) || !MRI->use_nodbg_empty(R)) { AllDead = false; break; } @@ -1598,7 +1597,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { IterListType Out; for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) { - unsigned Idx = TargetRegisterInfo::virtReg2Index(I->first); + unsigned Idx = Register::virtReg2Index(I->first); if (Idx >= Cutoff) Out.push_back(I); } diff --git a/lib/Target/Hexagon/HexagonGenMux.cpp b/lib/Target/Hexagon/HexagonGenMux.cpp index cdafbc20ab86..b559e7bbbb60 100644 --- a/lib/Target/Hexagon/HexagonGenMux.cpp +++ b/lib/Target/Hexagon/HexagonGenMux.cpp @@ -171,7 +171,7 @@ void HexagonGenMux::getDefsUses(const MachineInstr *MI, BitVector &Defs, for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || MO.isImplicit()) continue; - unsigned R = MO.getReg(); + Register R = MO.getReg(); BitVector &Set = MO.isDef() ? 
Defs : Uses; expandReg(R, Set); } @@ -239,14 +239,14 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) { unsigned Opc = MI->getOpcode(); if (!isCondTransfer(Opc)) continue; - unsigned DR = MI->getOperand(0).getReg(); + Register DR = MI->getOperand(0).getReg(); if (isRegPair(DR)) continue; MachineOperand &PredOp = MI->getOperand(1); if (PredOp.isUndef()) continue; - unsigned PR = PredOp.getReg(); + Register PR = PredOp.getReg(); unsigned Idx = I2X.lookup(MI); CondsetMap::iterator F = CM.find(DR); bool IfTrue = HII->isPredicatedTrue(Opc); diff --git a/lib/Target/Hexagon/HexagonGenPredicate.cpp b/lib/Target/Hexagon/HexagonGenPredicate.cpp index e991fa8b61c8..24d33c91a29b 100644 --- a/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -133,7 +133,7 @@ INITIALIZE_PASS_END(HexagonGenPredicate, "hexagon-gen-pred", "Hexagon generate predicate operations", false, false) bool HexagonGenPredicate::isPredReg(unsigned R) { - if (!TargetRegisterInfo::isVirtualRegister(R)) + if (!Register::isVirtualRegister(R)) return false; const TargetRegisterClass *RC = MRI->getRegClass(R); return RC == &Hexagon::PredRegsRegClass; @@ -213,7 +213,7 @@ void HexagonGenPredicate::collectPredicateGPR(MachineFunction &MF) { case TargetOpcode::COPY: if (isPredReg(MI->getOperand(1).getReg())) { RegisterSubReg RD = MI->getOperand(0); - if (TargetRegisterInfo::isVirtualRegister(RD.R)) + if (Register::isVirtualRegister(RD.R)) PredGPRs.insert(RD); } break; @@ -245,7 +245,7 @@ RegisterSubReg HexagonGenPredicate::getPredRegFor(const RegisterSubReg &Reg) { // Create a predicate register for a given Reg. The newly created register // will have its value copied from Reg, so that it can be later used as // an operand in other instructions. - assert(TargetRegisterInfo::isVirtualRegister(Reg.R)); + assert(Register::isVirtualRegister(Reg.R)); RegToRegMap::iterator F = G2P.find(Reg); if (F != G2P.end()) return F->second; @@ -265,7 +265,7 @@ RegisterSubReg HexagonGenPredicate::getPredRegFor(const RegisterSubReg &Reg) { MachineBasicBlock &B = *DefI->getParent(); DebugLoc DL = DefI->getDebugLoc(); const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass; - unsigned NewPR = MRI->createVirtualRegister(PredRC); + Register NewPR = MRI->createVirtualRegister(PredRC); // For convertible instructions, do not modify them, so that they can // be converted later. Generate a copy from Reg to NewPR. @@ -432,7 +432,7 @@ bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) { // Generate a copy-out: NewGPR = NewPR, and replace all uses of OutR // with NewGPR. 
const TargetRegisterClass *RC = MRI->getRegClass(OutR.R); - unsigned NewOutR = MRI->createVirtualRegister(RC); + Register NewOutR = MRI->createVirtualRegister(RC); BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), NewOutR) .addReg(NewPR.R, 0, NewPR.S); MRI->replaceRegWith(OutR.R, NewOutR); @@ -471,9 +471,9 @@ bool HexagonGenPredicate::eliminatePredCopies(MachineFunction &MF) { continue; RegisterSubReg DR = MI.getOperand(0); RegisterSubReg SR = MI.getOperand(1); - if (!TargetRegisterInfo::isVirtualRegister(DR.R)) + if (!Register::isVirtualRegister(DR.R)) continue; - if (!TargetRegisterInfo::isVirtualRegister(SR.R)) + if (!Register::isVirtualRegister(SR.R)) continue; if (MRI->getRegClass(DR.R) != PredRC) continue; diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp index cecbaedb6d70..62291790f0fe 100644 --- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -435,17 +435,17 @@ bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L, if (Phi->getOperand(i+1).getMBB() != Latch) continue; - unsigned PhiOpReg = Phi->getOperand(i).getReg(); + Register PhiOpReg = Phi->getOperand(i).getReg(); MachineInstr *DI = MRI->getVRegDef(PhiOpReg); if (DI->getDesc().isAdd()) { // If the register operand to the add is the PHI we're looking at, this // meets the induction pattern. - unsigned IndReg = DI->getOperand(1).getReg(); + Register IndReg = DI->getOperand(1).getReg(); MachineOperand &Opnd2 = DI->getOperand(2); int64_t V; if (MRI->getVRegDef(IndReg) == Phi && checkForImmediate(Opnd2, V)) { - unsigned UpdReg = DI->getOperand(0).getReg(); + Register UpdReg = DI->getOperand(0).getReg(); IndMap.insert(std::make_pair(UpdReg, std::make_pair(IndReg, V))); } } @@ -694,7 +694,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L, Cmp = Comparison::getSwappedComparison(Cmp); if (InitialValue->isReg()) { - unsigned R = InitialValue->getReg(); + Register R = InitialValue->getReg(); MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent(); if (!MDT->properlyDominates(DefBB, Header)) { int64_t V; @@ -704,7 +704,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L, OldInsts.push_back(MRI->getVRegDef(R)); } if (EndValue->isReg()) { - unsigned R = EndValue->getReg(); + Register R = EndValue->getReg(); MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent(); if (!MDT->properlyDominates(DefBB, Header)) { int64_t V; @@ -910,7 +910,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, (RegToImm ? 
TII->get(Hexagon::A2_subri) : TII->get(Hexagon::A2_addi)); if (RegToReg || RegToImm) { - unsigned SubR = MRI->createVirtualRegister(IntRC); + Register SubR = MRI->createVirtualRegister(IntRC); MachineInstrBuilder SubIB = BuildMI(*PH, InsertPos, DL, SubD, SubR); @@ -931,7 +931,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, EndValInstr->getOperand(2).getImm() == StartV) { DistR = EndValInstr->getOperand(1).getReg(); } else { - unsigned SubR = MRI->createVirtualRegister(IntRC); + Register SubR = MRI->createVirtualRegister(IntRC); MachineInstrBuilder SubIB = BuildMI(*PH, InsertPos, DL, SubD, SubR); SubIB.addReg(End->getReg(), 0, End->getSubReg()) @@ -950,7 +950,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, AdjSR = DistSR; } else { // Generate CountR = ADD DistR, AdjVal - unsigned AddR = MRI->createVirtualRegister(IntRC); + Register AddR = MRI->createVirtualRegister(IntRC); MCInstrDesc const &AddD = TII->get(Hexagon::A2_addi); BuildMI(*PH, InsertPos, DL, AddD, AddR) .addReg(DistR, 0, DistSR) @@ -971,7 +971,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, unsigned Shift = Log2_32(IVBump); // Generate NormR = LSR DistR, Shift. - unsigned LsrR = MRI->createVirtualRegister(IntRC); + Register LsrR = MRI->createVirtualRegister(IntRC); const MCInstrDesc &LsrD = TII->get(Hexagon::S2_lsr_i_r); BuildMI(*PH, InsertPos, DL, LsrD, LsrR) .addReg(AdjR, 0, AdjSR) @@ -1038,7 +1038,7 @@ bool HexagonHardwareLoops::isDead(const MachineInstr *MI, if (!MO.isReg() || !MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (MRI->use_nodbg_empty(Reg)) continue; @@ -1058,7 +1058,7 @@ bool HexagonHardwareLoops::isDead(const MachineInstr *MI, if (!OPO.isReg() || !OPO.isDef()) continue; - unsigned OPReg = OPO.getReg(); + Register OPReg = OPO.getReg(); use_nodbg_iterator nextJ; for (use_nodbg_iterator J = MRI->use_nodbg_begin(OPReg); J != End; J = nextJ) { @@ -1092,7 +1092,7 @@ void HexagonHardwareLoops::removeIfDead(MachineInstr *MI) { const MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || !MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); MachineRegisterInfo::use_iterator nextI; for (MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg), E = MRI->use_end(); I != E; I = nextI) { @@ -1244,7 +1244,7 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L, if (TripCount->isReg()) { // Create a copy of the loop count register. - unsigned CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass); + Register CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass); BuildMI(*Preheader, InsertPos, DL, TII->get(TargetOpcode::COPY), CountReg) .addReg(TripCount->getReg(), 0, TripCount->getSubReg()); // Add the Loop instruction to the beginning of the loop. @@ -1257,7 +1257,7 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L, // create a new virtual register. int64_t CountImm = TripCount->getImm(); if (!TII->isValidOffset(LOOP_i, CountImm, TRI)) { - unsigned CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass); + Register CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass); BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::A2_tfrsi), CountReg) .addImm(CountImm); BuildMI(*Preheader, InsertPos, DL, TII->get(LOOP_r)) @@ -1333,7 +1333,7 @@ bool HexagonHardwareLoops::orderBumpCompare(MachineInstr *BumpI, return true; // Out of order. 
- unsigned PredR = CmpI->getOperand(0).getReg(); + Register PredR = CmpI->getOperand(0).getReg(); bool FoundBump = false; instr_iterator CmpIt = CmpI->getIterator(), NextIt = std::next(CmpIt); for (instr_iterator I = NextIt, E = BB->instr_end(); I != E; ++I) { @@ -1428,10 +1428,10 @@ bool HexagonHardwareLoops::loopCountMayWrapOrUnderFlow( if (checkForImmediate(*InitVal, Imm)) return (EndVal->getImm() == Imm); - unsigned Reg = InitVal->getReg(); + Register Reg = InitVal->getReg(); // We don't know the value of a physical register. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return true; MachineInstr *Def = MRI->getVRegDef(Reg); @@ -1508,8 +1508,8 @@ bool HexagonHardwareLoops::checkForImmediate(const MachineOperand &MO, // processed to handle potential subregisters in MO. int64_t TV; - unsigned R = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = MO.getReg(); + if (!Register::isVirtualRegister(R)) return false; MachineInstr *DI = MRI->getVRegDef(R); unsigned DOpc = DI->getOpcode(); @@ -1582,11 +1582,11 @@ void HexagonHardwareLoops::setImmediate(MachineOperand &MO, int64_t Val) { } assert(MO.isReg()); - unsigned R = MO.getReg(); + Register R = MO.getReg(); MachineInstr *DI = MRI->getVRegDef(R); const TargetRegisterClass *RC = MRI->getRegClass(R); - unsigned NewR = MRI->createVirtualRegister(RC); + Register NewR = MRI->createVirtualRegister(RC); MachineBasicBlock &B = *DI->getParent(); DebugLoc DL = DI->getDebugLoc(); BuildMI(B, DI, DL, TII->get(DI->getOpcode()), NewR).addImm(Val); @@ -1634,17 +1634,17 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) { if (Phi->getOperand(i+1).getMBB() != Latch) continue; - unsigned PhiReg = Phi->getOperand(i).getReg(); + Register PhiReg = Phi->getOperand(i).getReg(); MachineInstr *DI = MRI->getVRegDef(PhiReg); if (DI->getDesc().isAdd()) { // If the register operand to the add/sub is the PHI we are looking // at, this meets the induction pattern. - unsigned IndReg = DI->getOperand(1).getReg(); + Register IndReg = DI->getOperand(1).getReg(); MachineOperand &Opnd2 = DI->getOperand(2); int64_t V; if (MRI->getVRegDef(IndReg) == Phi && checkForImmediate(Opnd2, V)) { - unsigned UpdReg = DI->getOperand(0).getReg(); + Register UpdReg = DI->getOperand(0).getReg(); IndRegs.insert(std::make_pair(UpdReg, std::make_pair(IndReg, V))); } } @@ -1702,7 +1702,7 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) { if (!Cond[CSz-1].isReg()) return false; - unsigned P = Cond[CSz-1].getReg(); + Register P = Cond[CSz - 1].getReg(); MachineInstr *PredDef = MRI->getVRegDef(P); if (!PredDef->isCompare()) @@ -1903,15 +1903,15 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop( MachineInstr *NewPN = MF->CreateMachineInstr(PD, DL); NewPH->insert(NewPH->end(), NewPN); - unsigned PR = PN->getOperand(0).getReg(); + Register PR = PN->getOperand(0).getReg(); const TargetRegisterClass *RC = MRI->getRegClass(PR); - unsigned NewPR = MRI->createVirtualRegister(RC); + Register NewPR = MRI->createVirtualRegister(RC); NewPN->addOperand(MachineOperand::CreateReg(NewPR, true)); // Copy all non-latch operands of a header's PHI node to the newly // created PHI node in the preheader. 
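// Illustrative example (not from this change): given a header PHI
//   %v = PHI [%a, %bb1], [%b, %bb2], [%c, %latch]
// the non-latch inputs are routed through the new preheader PHI, and the
// header PHI shrinks to two inputs:
//   preheader: %np = PHI [%a, %bb1], [%b, %bb2]
//   header:    %v  = PHI [%np, %preheader], [%c, %latch]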
for (unsigned i = 1, n = PN->getNumOperands(); i < n; i += 2) { - unsigned PredR = PN->getOperand(i).getReg(); + Register PredR = PN->getOperand(i).getReg(); unsigned PredRSub = PN->getOperand(i).getSubReg(); MachineBasicBlock *PredB = PN->getOperand(i+1).getMBB(); if (PredB == Latch) diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 605fcfc25559..4684d8e4781a 100644 --- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -697,7 +697,7 @@ void HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) { // void HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) { SDLoc dl(N); - ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N); + auto *CN = cast<ConstantFPSDNode>(N); APInt A = CN->getValueAPF().bitcastToAPInt(); if (N->getValueType(0) == MVT::f32) { SDValue V = CurDAG->getTargetConstant(A.getZExtValue(), dl, MVT::i32); diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index fef5a98cdb00..8a8986e232a0 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -240,12 +240,12 @@ bool HexagonTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { return true; } -unsigned HexagonTargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { +Register HexagonTargetLowering::getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &) const { // Just support r19, the linux kernel uses it. - unsigned Reg = StringSwitch<unsigned>(RegName) + Register Reg = StringSwitch<Register>(RegName) .Case("r19", Hexagon::R19) - .Default(0); + .Default(Register()); if (Reg) return Reg; @@ -286,7 +286,7 @@ SDValue HexagonTargetLowering::LowerCallResult( SDValue FR0 = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(), MVT::i32, Glue); // FR0 = (Value, Chain, Glue) - unsigned PredR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass); + Register PredR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass); SDValue TPR = DAG.getCopyToReg(FR0.getValue(1), dl, PredR, FR0.getValue(0), FR0.getValue(2)); // TPR = (Chain, Glue) @@ -736,7 +736,7 @@ SDValue HexagonTargetLowering::LowerFormalArguments( RegVT = VA.getValVT(); const TargetRegisterClass *RC = getRegClassFor(RegVT); - unsigned VReg = MRI.createVirtualRegister(RC); + Register VReg = MRI.createVirtualRegister(RC); SDValue Copy = DAG.getCopyFromReg(Chain, dl, VReg, RegVT); // Treat values of type MVT::i1 specially: they are passed in @@ -870,15 +870,20 @@ SDValue HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue PredOp = Op.getOperand(0); SDValue Op1 = Op.getOperand(1), Op2 = Op.getOperand(2); - EVT OpVT = Op1.getValueType(); - SDLoc DL(Op); + MVT OpTy = ty(Op1); + const SDLoc &dl(Op); - if (OpVT == MVT::v2i16) { - SDValue X1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i32, Op1); - SDValue X2 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i32, Op2); - SDValue SL = DAG.getNode(ISD::VSELECT, DL, MVT::v2i32, PredOp, X1, X2); - SDValue TR = DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i16, SL); - return TR; + if (OpTy == MVT::v2i16 || OpTy == MVT::v4i8) { + MVT ElemTy = OpTy.getVectorElementType(); + assert(ElemTy.isScalarInteger()); + MVT WideTy = MVT::getVectorVT(MVT::getIntegerVT(2*ElemTy.getSizeInBits()), + OpTy.getVectorNumElements()); + // Generate (trunc (select (_, sext, sext))). 
+ return DAG.getSExtOrTrunc( + DAG.getSelect(dl, WideTy, PredOp, + DAG.getSExtOrTrunc(Op1, dl, WideTy), + DAG.getSExtOrTrunc(Op2, dl, WideTy)), + dl, OpTy); } return SDValue(); @@ -1230,9 +1235,9 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, Subtarget(ST) { auto &HRI = *Subtarget.getRegisterInfo(); - setPrefLoopAlignment(4); - setPrefFunctionAlignment(4); - setMinFunctionAlignment(2); + setPrefLoopAlignment(Align(16)); + setMinFunctionAlignment(Align(4)); + setPrefFunctionAlignment(Align(16)); setStackPointerRegisterToSaveRestore(HRI.getStackRegister()); setBooleanContents(TargetLoweringBase::UndefinedBooleanContent); setBooleanVectorContents(TargetLoweringBase::UndefinedBooleanContent); @@ -1434,12 +1439,12 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, ISD::CONCAT_VECTORS, ISD::VECTOR_SHUFFLE }; - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { for (unsigned VectExpOp : VectExpOps) setOperationAction(VectExpOp, VT, Expand); // Expand all extending loads and truncating stores: - for (MVT TargetVT : MVT::vector_valuetypes()) { + for (MVT TargetVT : MVT::fixedlen_vector_valuetypes()) { if (TargetVT == VT) continue; setLoadExtAction(ISD::EXTLOAD, TargetVT, VT, Expand); @@ -1496,16 +1501,21 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, VT, Custom); } - for (MVT VT : {MVT::v2i16, MVT::v4i8, MVT::v2i32, MVT::v4i16, MVT::v2i32}) { - setCondCodeAction(ISD::SETLT, VT, Expand); + for (MVT VT : {MVT::v2i16, MVT::v4i8, MVT::v8i8, MVT::v2i32, MVT::v4i16, + MVT::v2i32}) { + setCondCodeAction(ISD::SETNE, VT, Expand); setCondCodeAction(ISD::SETLE, VT, Expand); - setCondCodeAction(ISD::SETULT, VT, Expand); + setCondCodeAction(ISD::SETGE, VT, Expand); + setCondCodeAction(ISD::SETLT, VT, Expand); setCondCodeAction(ISD::SETULE, VT, Expand); + setCondCodeAction(ISD::SETUGE, VT, Expand); + setCondCodeAction(ISD::SETULT, VT, Expand); } // Custom-lower bitcasts from i8 to v8i1. setOperationAction(ISD::BITCAST, MVT::i8, Custom); setOperationAction(ISD::SETCC, MVT::v2i16, Custom); + setOperationAction(ISD::VSELECT, MVT::v4i8, Custom); setOperationAction(ISD::VSELECT, MVT::v2i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); @@ -1554,6 +1564,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSUB, MVT::f64, Legal); } + setTargetDAGCombine(ISD::VSELECT); + if (Subtarget.useHVXOps()) initializeHVXLowering(); @@ -1643,6 +1655,8 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::VINSERTW0: return "HexagonISD::VINSERTW0"; case HexagonISD::VROR: return "HexagonISD::VROR"; case HexagonISD::READCYCLE: return "HexagonISD::READCYCLE"; + case HexagonISD::PTRUE: return "HexagonISD::PTRUE"; + case HexagonISD::PFALSE: return "HexagonISD::PFALSE"; case HexagonISD::VZERO: return "HexagonISD::VZERO"; case HexagonISD::VSPLATW: return "HexagonISD::VSPLATW"; case HexagonISD::D2P: return "HexagonISD::D2P"; @@ -1783,7 +1797,8 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, // The offset value comes through Modifier register. For now, assume the // offset is 0. 
Info.offset = 0; - Info.align = DL.getABITypeAlignment(Info.memVT.getTypeForEVT(Cont)); + Info.align = + MaybeAlign(DL.getABITypeAlignment(Info.memVT.getTypeForEVT(Cont))); Info.flags = MachineMemOperand::MOLoad; return true; } @@ -1805,7 +1820,8 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(VecTy); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = M.getDataLayout().getTypeAllocSizeInBits(VecTy) / 8; + Info.align = + MaybeAlign(M.getDataLayout().getTypeAllocSizeInBits(VecTy) / 8); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; @@ -1817,6 +1833,10 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return false; } +bool HexagonTargetLowering::hasBitTest(SDValue X, SDValue Y) const { + return X.getValueType().isScalarInteger(); // 'tstbit' +} + bool HexagonTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { return isTruncateFree(EVT::getEVT(Ty1), EVT::getEVT(Ty2)); } @@ -1844,26 +1864,33 @@ bool HexagonTargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, TargetLoweringBase::LegalizeTypeAction HexagonTargetLowering::getPreferredVectorAction(MVT VT) const { - if (VT.getVectorNumElements() == 1) - return TargetLoweringBase::TypeScalarizeVector; - - // Always widen vectors of i1. + unsigned VecLen = VT.getVectorNumElements(); MVT ElemTy = VT.getVectorElementType(); - if (ElemTy == MVT::i1) - return TargetLoweringBase::TypeWidenVector; + + if (VecLen == 1 || VT.isScalableVector()) + return TargetLoweringBase::TypeScalarizeVector; if (Subtarget.useHVXOps()) { + unsigned HwLen = Subtarget.getVectorLength(); // If the size of VT is at least half of the vector length, // widen the vector. Note: the threshold was not selected in // any scientific way. ArrayRef<MVT> Tys = Subtarget.getHVXElementTypes(); if (llvm::find(Tys, ElemTy) != Tys.end()) { - unsigned HwWidth = 8*Subtarget.getVectorLength(); + unsigned HwWidth = 8*HwLen; unsigned VecWidth = VT.getSizeInBits(); if (VecWidth >= HwWidth/2 && VecWidth < HwWidth) return TargetLoweringBase::TypeWidenVector; } + // Split vectors of i1 that correspond to (byte) vector pairs. + if (ElemTy == MVT::i1 && VecLen == 2*HwLen) + return TargetLoweringBase::TypeSplitVector; } + + // Always widen (remaining) vectors of i1. + if (ElemTy == MVT::i1) + return TargetLoweringBase::TypeWidenVector; + return TargetLoweringBase::TypeSplitVector; } @@ -2452,6 +2479,23 @@ HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return buildVector64(Ops, dl, VecTy, DAG); if (VecTy == MVT::v8i1 || VecTy == MVT::v4i1 || VecTy == MVT::v2i1) { + // Check if this is a special case or all-0 or all-1. + bool All0 = true, All1 = true; + for (SDValue P : Ops) { + auto *CN = dyn_cast<ConstantSDNode>(P.getNode()); + if (CN == nullptr) { + All0 = All1 = false; + break; + } + uint32_t C = CN->getZExtValue(); + All0 &= (C == 0); + All1 &= (C == 1); + } + if (All0) + return DAG.getNode(HexagonISD::PFALSE, dl, VecTy); + if (All1) + return DAG.getNode(HexagonISD::PTRUE, dl, VecTy); + // For each i1 element in the resulting predicate register, put 1 // shifted by the index of the element into a general-purpose register, // then or them together and transfer it back into a predicate register. 
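The comment above describes building the predicate via a scalar mask: each i1
lane contributes one bit at its lane index, the bits are OR'd together in a
general-purpose register, and the word is then transferred into a predicate
register (C2_tfrrp on Hexagon). A minimal standalone sketch of the mask
construction, with buildPredicateWord as a hypothetical helper name:

  #include <cstdint>
  #include <vector>

  // Collapse i1 lanes into the scalar word that would be moved into a
  // predicate register; lane I contributes bit (1 << I) when set.
  uint32_t buildPredicateWord(const std::vector<bool> &Lanes) {
    uint32_t Word = 0;
    for (unsigned I = 0; I != Lanes.size(); ++I)
      if (Lanes[I])
        Word |= 1u << I;
    return Word;
  }
  // e.g. buildPredicateWord({true, false, true, true}) == 0b1101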
@@ -2629,7 +2673,8 @@ HexagonTargetLowering::LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG) DoDefault = true; if (!AlignLoads) { - if (allowsMemoryAccess(Ctx, DL, LN->getMemoryVT(), *LN->getMemOperand())) + if (allowsMemoryAccessForAlignment(Ctx, DL, LN->getMemoryVT(), + *LN->getMemOperand())) return Op; DoDefault = true; } @@ -2637,7 +2682,8 @@ HexagonTargetLowering::LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG) // The PartTy is the equivalent of "getLoadableTypeOfSize(HaveAlign)". MVT PartTy = HaveAlign <= 8 ? MVT::getIntegerVT(8 * HaveAlign) : MVT::getVectorVT(MVT::i8, HaveAlign); - DoDefault = allowsMemoryAccess(Ctx, DL, PartTy, *LN->getMemOperand()); + DoDefault = + allowsMemoryAccessForAlignment(Ctx, DL, PartTy, *LN->getMemOperand()); } if (DoDefault) { std::pair<SDValue, SDValue> P = expandUnalignedLoad(LN, DAG); @@ -2865,12 +2911,54 @@ HexagonTargetLowering::ReplaceNodeResults(SDNode *N, if (N->getValueType(0) == MVT::i8) { SDValue P = getInstr(Hexagon::C2_tfrpr, dl, MVT::i32, N->getOperand(0), DAG); - Results.push_back(P); + SDValue T = DAG.getAnyExtOrTrunc(P, dl, MVT::i8); + Results.push_back(T); } break; } } +SDValue +HexagonTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) + const { + SDValue Op(N, 0); + if (isHvxOperation(Op)) { + if (SDValue V = PerformHvxDAGCombine(N, DCI)) + return V; + return SDValue(); + } + + const SDLoc &dl(Op); + unsigned Opc = Op.getOpcode(); + + if (Opc == HexagonISD::P2D) { + SDValue P = Op.getOperand(0); + switch (P.getOpcode()) { + case HexagonISD::PTRUE: + return DCI.DAG.getConstant(-1, dl, ty(Op)); + case HexagonISD::PFALSE: + return getZero(dl, ty(Op), DCI.DAG); + default: + break; + } + } else if (Opc == ISD::VSELECT) { + // This is pretty much duplicated in HexagonISelLoweringHVX... + // + // (vselect (xor x, ptrue), v0, v1) -> (vselect x, v1, v0) + SDValue Cond = Op.getOperand(0); + if (Cond->getOpcode() == ISD::XOR) { + SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1); + if (C1->getOpcode() == HexagonISD::PTRUE) { + SDValue VSel = DCI.DAG.getNode(ISD::VSELECT, dl, ty(Op), C0, + Op.getOperand(2), Op.getOperand(1)); + return VSel; + } + } + } + + return SDValue(); +} + /// Returns relocation base for the given PIC jumptable. SDValue HexagonTargetLowering::getPICJumpTableRelocBase(SDValue Table, diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index 4e467cb22727..75f553bfec7f 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -68,6 +68,8 @@ namespace HexagonISD { EH_RETURN, DCFETCH, READCYCLE, + PTRUE, + PFALSE, D2P, // Convert 8-byte value to 8-bit predicate register. [*] P2D, // Convert 8-bit predicate register to 8-byte value. [*] V2Q, // Convert HVX vector to a vector predicate reg. 
[*] @@ -127,6 +129,8 @@ namespace HexagonISD { bool isCheapToSpeculateCtlz() const override { return true; } bool isCtlzFast() const override { return true; } + bool hasBitTest(SDValue X, SDValue Y) const override; + bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override; /// Return true if an FMA operation is faster than a pair of mul and add @@ -221,10 +225,12 @@ namespace HexagonISD { const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + bool mayBeEmittedAsTailCall(const CallInst *CI) const override; - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; /// If a physical register, this returns the register that receives the /// exception address on entry to an EH pad. @@ -299,7 +305,8 @@ namespace HexagonISD { const AttributeList &FuncAttributes) const override; bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, - unsigned Align, MachineMemOperand::Flags Flags, bool *Fast) const override; + unsigned Align, MachineMemOperand::Flags Flags, bool *Fast) + const override; /// Returns relocation base for the given PIC jumptable. SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) @@ -456,6 +463,8 @@ namespace HexagonISD { bool isHvxOperation(SDValue Op) const; SDValue LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const; + + SDValue PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; }; } // end namespace llvm diff --git a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 345c657787a0..bc8a9959c917 100644 --- a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -193,6 +193,8 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::OR, BoolV, Legal); setOperationAction(ISD::XOR, BoolV, Legal); } + + setTargetDAGCombine(ISD::VSELECT); } SDValue @@ -1580,6 +1582,28 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("Unhandled HVX operation"); } +SDValue +HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) + const { + const SDLoc &dl(N); + SDValue Op(N, 0); + + unsigned Opc = Op.getOpcode(); + if (Opc == ISD::VSELECT) { + // (vselect (xor x, qtrue), v0, v1) -> (vselect x, v1, v0) + SDValue Cond = Op.getOperand(0); + if (Cond->getOpcode() == ISD::XOR) { + SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1); + if (C1->getOpcode() == HexagonISD::QTRUE) { + SDValue VSel = DCI.DAG.getNode(ISD::VSELECT, dl, ty(Op), C0, + Op.getOperand(2), Op.getOperand(1)); + return VSel; + } + } + } + return SDValue(); +} + bool HexagonTargetLowering::isHvxOperation(SDValue Op) const { // If the type of the result, or any operand type are HVX vector types, diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index a156de5ba128..767538f92ed6 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -193,7 +193,7 @@ static inline void parseOperands(const MachineInstr &MI, if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; @@ -674,86 +674,96 @@ unsigned HexagonInstrInfo::insertBranch(MachineBasicBlock &MBB, return 2; } -/// Analyze the loop code to find the loop induction variable 
and compare used -/// to compute the number of iterations. Currently, we analyze loop that are -/// controlled using hardware loops. In this case, the induction variable -/// instruction is null. For all other cases, this function returns true, which -/// means we're unable to analyze it. -bool HexagonInstrInfo::analyzeLoop(MachineLoop &L, - MachineInstr *&IndVarInst, - MachineInstr *&CmpInst) const { +namespace { +class HexagonPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo { + MachineInstr *Loop, *EndLoop; + MachineFunction *MF; + const HexagonInstrInfo *TII; + int64_t TripCount; + Register LoopCount; + DebugLoc DL; - MachineBasicBlock *LoopEnd = L.getBottomBlock(); - MachineBasicBlock::iterator I = LoopEnd->getFirstTerminator(); - // We really "analyze" only hardware loops right now. - if (I != LoopEnd->end() && isEndLoopN(I->getOpcode())) { - IndVarInst = nullptr; - CmpInst = &*I; - return false; +public: + HexagonPipelinerLoopInfo(MachineInstr *Loop, MachineInstr *EndLoop) + : Loop(Loop), EndLoop(EndLoop), MF(Loop->getParent()->getParent()), + TII(MF->getSubtarget<HexagonSubtarget>().getInstrInfo()), + DL(Loop->getDebugLoc()) { + // Inspect the Loop instruction up-front, as it may be deleted when we call + // createTripCountGreaterCondition. + TripCount = Loop->getOpcode() == Hexagon::J2_loop0r + ? -1 + : Loop->getOperand(1).getImm(); + if (TripCount == -1) + LoopCount = Loop->getOperand(1).getReg(); } - return true; -} -/// Generate code to reduce the loop iteration by one and check if the loop is -/// finished. Return the value/register of the new loop count. this function -/// assumes the nth iteration is peeled first. -unsigned HexagonInstrInfo::reduceLoopCount( - MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar, - MachineInstr &Cmp, SmallVectorImpl<MachineOperand> &Cond, - SmallVectorImpl<MachineInstr *> &PrevInsts, unsigned Iter, - unsigned MaxIter) const { - // We expect a hardware loop currently. This means that IndVar is set - // to null, and the compare is the ENDLOOP instruction. - assert((!IndVar) && isEndLoopN(Cmp.getOpcode()) - && "Expecting a hardware loop"); - MachineFunction *MF = MBB.getParent(); - DebugLoc DL = Cmp.getDebugLoc(); - SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs; - MachineInstr *Loop = findLoopInstr(&MBB, Cmp.getOpcode(), - Cmp.getOperand(0).getMBB(), VisitedBBs); - if (!Loop) - return 0; - // If the loop trip count is a compile-time value, then just change the - // value. - if (Loop->getOpcode() == Hexagon::J2_loop0i || - Loop->getOpcode() == Hexagon::J2_loop1i) { - int64_t Offset = Loop->getOperand(1).getImm(); - if (Offset <= 1) - Loop->eraseFromParent(); - else - Loop->getOperand(1).setImm(Offset - 1); - return Offset - 1; + bool shouldIgnoreForPipelining(const MachineInstr *MI) const override { + // Only ignore the terminator. + return MI == EndLoop; } - // The loop trip count is a run-time value. We generate code to subtract - // one from the trip count, and update the loop instruction. - assert(Loop->getOpcode() == Hexagon::J2_loop0r && "Unexpected instruction"); - unsigned LoopCount = Loop->getOperand(1).getReg(); - // Check if we're done with the loop. - unsigned LoopEnd = createVR(MF, MVT::i1); - MachineInstr *NewCmp = BuildMI(&MBB, DL, get(Hexagon::C2_cmpgtui), LoopEnd). - addReg(LoopCount).addImm(1); - unsigned NewLoopCount = createVR(MF, MVT::i32); - MachineInstr *NewAdd = BuildMI(&MBB, DL, get(Hexagon::A2_addi), NewLoopCount). 
- addReg(LoopCount).addImm(-1); - const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo(); - // Update the previously generated instructions with the new loop counter. - for (SmallVectorImpl<MachineInstr *>::iterator I = PrevInsts.begin(), - E = PrevInsts.end(); I != E; ++I) - (*I)->substituteRegister(LoopCount, NewLoopCount, 0, HRI); - PrevInsts.clear(); - PrevInsts.push_back(NewCmp); - PrevInsts.push_back(NewAdd); - // Insert the new loop instruction if this is the last time the loop is - // decremented. - if (Iter == MaxIter) - BuildMI(&MBB, DL, get(Hexagon::J2_loop0r)). - addMBB(Loop->getOperand(0).getMBB()).addReg(NewLoopCount); - // Delete the old loop instruction. - if (Iter == 0) - Loop->eraseFromParent(); - Cond.push_back(MachineOperand::CreateImm(Hexagon::J2_jumpf)); - Cond.push_back(NewCmp->getOperand(0)); - return NewLoopCount; + + Optional<bool> + createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB, + SmallVectorImpl<MachineOperand> &Cond) override { + if (TripCount == -1) { + // Check if we're done with the loop. + unsigned Done = TII->createVR(MF, MVT::i1); + MachineInstr *NewCmp = BuildMI(&MBB, DL, + TII->get(Hexagon::C2_cmpgtui), Done) + .addReg(LoopCount) + .addImm(TC); + Cond.push_back(MachineOperand::CreateImm(Hexagon::J2_jumpf)); + Cond.push_back(NewCmp->getOperand(0)); + return {}; + } + + return TripCount > TC; + } + + void setPreheader(MachineBasicBlock *NewPreheader) override { + NewPreheader->splice(NewPreheader->getFirstTerminator(), Loop->getParent(), + Loop); + } + + void adjustTripCount(int TripCountAdjust) override { + // If the loop trip count is a compile-time value, then just change the + // value. + if (Loop->getOpcode() == Hexagon::J2_loop0i || + Loop->getOpcode() == Hexagon::J2_loop1i) { + int64_t TripCount = Loop->getOperand(1).getImm() + TripCountAdjust; + assert(TripCount > 0 && "Can't create an empty or negative loop!"); + Loop->getOperand(1).setImm(TripCount); + return; + } + + // The loop trip count is a run-time value. We generate code to subtract + // one from the trip count, and update the loop instruction. + Register LoopCount = Loop->getOperand(1).getReg(); + Register NewLoopCount = TII->createVR(MF, MVT::i32); + BuildMI(*Loop->getParent(), Loop, Loop->getDebugLoc(), + TII->get(Hexagon::A2_addi), NewLoopCount) + .addReg(LoopCount) + .addImm(TripCountAdjust); + Loop->getOperand(1).setReg(NewLoopCount); + } + + void disposed() override { Loop->eraseFromParent(); } +}; +} // namespace + +std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> +HexagonInstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const { + // We really "analyze" only hardware loops right now. 
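// Schematic picture (not from this change) of the hardware loops this hook
// recognizes: a J2_loop0* setup paired with an ENDLOOPn terminator,
//   loop0(.LBB0_1, r2)     ; J2_loop0r -- run-time trip count in r2
// .LBB0_1:
//   ...                    ; loop body
//   { ... } :endloop0      ; ENDLOOP0 terminator found below
// Analysis succeeds only when findLoopInstr() can walk back from the
// ENDLOOP's target block to the matching setup instruction.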
+ MachineBasicBlock::iterator I = LoopBB->getFirstTerminator(); + + if (I != LoopBB->end() && isEndLoopN(I->getOpcode())) { + SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs; + MachineInstr *LoopInst = findLoopInstr( + LoopBB, I->getOpcode(), I->getOperand(0).getMBB(), VisitedBBs); + if (LoopInst) + return std::make_unique<HexagonPipelinerLoopInfo>(LoopInst, &*I); + } + return nullptr; } bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, @@ -839,8 +849,8 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } if (Hexagon::HvxWRRegClass.contains(SrcReg, DestReg)) { - unsigned LoSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_lo); - unsigned HiSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_hi); + Register LoSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_lo); + Register HiSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_hi); BuildMI(MBB, I, DL, get(Hexagon::V6_vcombine), DestReg) .addReg(HiSrc, KillFlag) .addReg(LoSrc, KillFlag); @@ -1017,7 +1027,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); auto RealCirc = [&](unsigned Opc, bool HasImm, unsigned MxOp) { - unsigned Mx = MI.getOperand(MxOp).getReg(); + Register Mx = MI.getOperand(MxOp).getReg(); unsigned CSx = (Mx == Hexagon::M0 ? Hexagon::CS0 : Hexagon::CS1); BuildMI(MBB, MI, DL, get(Hexagon::A2_tfrrcr), CSx) .add(MI.getOperand((HasImm ? 5 : 4))); @@ -1049,8 +1059,8 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MBB.erase(MI); return true; case Hexagon::V6_vassignp: { - unsigned SrcReg = MI.getOperand(1).getReg(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); unsigned Kill = getKillRegState(MI.getOperand(1).isKill()); BuildMI(MBB, MI, DL, get(Hexagon::V6_vcombine), DstReg) .addReg(HRI.getSubReg(SrcReg, Hexagon::vsub_hi), Kill) @@ -1059,18 +1069,18 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } case Hexagon::V6_lo: { - unsigned SrcReg = MI.getOperand(1).getReg(); - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::vsub_lo); + Register SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::vsub_lo); copyPhysReg(MBB, MI, DL, DstReg, SrcSubLo, MI.getOperand(1).isKill()); MBB.erase(MI); MRI.clearKillFlags(SrcSubLo); return true; } case Hexagon::V6_hi: { - unsigned SrcReg = MI.getOperand(1).getReg(); - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::vsub_hi); + Register SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::vsub_hi); copyPhysReg(MBB, MI, DL, DstReg, SrcSubHi, MI.getOperand(1).isKill()); MBB.erase(MI); MRI.clearKillFlags(SrcSubHi); @@ -1079,9 +1089,9 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case Hexagon::PS_vstorerw_ai: case Hexagon::PS_vstorerwu_ai: { bool Aligned = Opc == Hexagon::PS_vstorerw_ai; - unsigned SrcReg = MI.getOperand(2).getReg(); - unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::vsub_hi); - unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::vsub_lo); + Register SrcReg = MI.getOperand(2).getReg(); + Register SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::vsub_hi); + Register SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::vsub_lo); unsigned NewOpc = Aligned ? 
Hexagon::V6_vS32b_ai : Hexagon::V6_vS32Ub_ai; unsigned Offset = HRI.getSpillSize(Hexagon::HvxVRRegClass); @@ -1103,7 +1113,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case Hexagon::PS_vloadrw_ai: case Hexagon::PS_vloadrwu_ai: { bool Aligned = Opc == Hexagon::PS_vloadrw_ai; - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); unsigned NewOpc = Aligned ? Hexagon::V6_vL32b_ai : Hexagon::V6_vL32Ub_ai; unsigned Offset = HRI.getSpillSize(Hexagon::HvxVRRegClass); @@ -1122,7 +1132,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } case Hexagon::PS_true: { - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); BuildMI(MBB, MI, DL, get(Hexagon::C2_orn), Reg) .addReg(Reg, RegState::Undef) .addReg(Reg, RegState::Undef); @@ -1130,7 +1140,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } case Hexagon::PS_false: { - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); BuildMI(MBB, MI, DL, get(Hexagon::C2_andn), Reg) .addReg(Reg, RegState::Undef) .addReg(Reg, RegState::Undef); @@ -1152,7 +1162,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } case Hexagon::PS_vdd0: { - unsigned Vd = MI.getOperand(0).getReg(); + Register Vd = MI.getOperand(0).getReg(); BuildMI(MBB, MI, DL, get(Hexagon::V6_vsubw_dv), Vd) .addReg(Vd, RegState::Undef) .addReg(Vd, RegState::Undef); @@ -1161,13 +1171,13 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } case Hexagon::PS_vmulw: { // Expand a 64-bit vector multiply into 2 32-bit scalar multiplies. - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned Src1Reg = MI.getOperand(1).getReg(); - unsigned Src2Reg = MI.getOperand(2).getReg(); - unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::isub_hi); - unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::isub_lo); - unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::isub_hi); - unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::isub_lo); + Register DstReg = MI.getOperand(0).getReg(); + Register Src1Reg = MI.getOperand(1).getReg(); + Register Src2Reg = MI.getOperand(2).getReg(); + Register Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::isub_hi); + Register Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::isub_lo); + Register Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::isub_hi); + Register Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::isub_lo); BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::M2_mpyi), HRI.getSubReg(DstReg, Hexagon::isub_hi)) .addReg(Src1SubHi) @@ -1185,16 +1195,16 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } case Hexagon::PS_vmulw_acc: { // Expand 64-bit vector multiply with addition into 2 scalar multiplies. 
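// Net effect, sketched in scalar terms (operand roles follow the BuildMI
// calls below): one M2_maci multiply-accumulate per 32-bit half, where the
// high subregisters of the sources combine into the high half of the result
// and the low subregisters into the low half. There are no cross terms, so
// this is two independent 32-bit MACs, not a 64-bit widening multiply.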
- unsigned DstReg = MI.getOperand(0).getReg(); - unsigned Src1Reg = MI.getOperand(1).getReg(); - unsigned Src2Reg = MI.getOperand(2).getReg(); - unsigned Src3Reg = MI.getOperand(3).getReg(); - unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::isub_hi); - unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::isub_lo); - unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::isub_hi); - unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::isub_lo); - unsigned Src3SubHi = HRI.getSubReg(Src3Reg, Hexagon::isub_hi); - unsigned Src3SubLo = HRI.getSubReg(Src3Reg, Hexagon::isub_lo); + Register DstReg = MI.getOperand(0).getReg(); + Register Src1Reg = MI.getOperand(1).getReg(); + Register Src2Reg = MI.getOperand(2).getReg(); + Register Src3Reg = MI.getOperand(3).getReg(); + Register Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::isub_hi); + Register Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::isub_lo); + Register Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::isub_hi); + Register Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::isub_lo); + Register Src3SubHi = HRI.getSubReg(Src3Reg, Hexagon::isub_hi); + Register Src3SubLo = HRI.getSubReg(Src3Reg, Hexagon::isub_lo); BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::M2_maci), HRI.getSubReg(DstReg, Hexagon::isub_hi)) .addReg(Src1SubHi) @@ -1219,10 +1229,10 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { const MachineOperand &Op1 = MI.getOperand(1); const MachineOperand &Op2 = MI.getOperand(2); const MachineOperand &Op3 = MI.getOperand(3); - unsigned Rd = Op0.getReg(); - unsigned Pu = Op1.getReg(); - unsigned Rs = Op2.getReg(); - unsigned Rt = Op3.getReg(); + Register Rd = Op0.getReg(); + Register Pu = Op1.getReg(); + Register Rs = Op2.getReg(); + Register Rt = Op3.getReg(); DebugLoc DL = MI.getDebugLoc(); unsigned K1 = getKillRegState(Op1.isKill()); unsigned K2 = getKillRegState(Op2.isKill()); @@ -1246,7 +1256,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { LivePhysRegs LiveAtMI(HRI); getLiveRegsAt(LiveAtMI, MI); bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg()); - unsigned PReg = Op1.getReg(); + Register PReg = Op1.getReg(); assert(Op1.getSubReg() == 0); unsigned PState = getRegState(Op1); @@ -1280,15 +1290,15 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { LivePhysRegs LiveAtMI(HRI); getLiveRegsAt(LiveAtMI, MI); bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg()); - unsigned PReg = Op1.getReg(); + Register PReg = Op1.getReg(); assert(Op1.getSubReg() == 0); unsigned PState = getRegState(Op1); if (Op0.getReg() != Op2.getReg()) { unsigned S = Op0.getReg() != Op3.getReg() ? 
PState & ~RegState::Kill : PState; - unsigned SrcLo = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_lo); - unsigned SrcHi = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_hi); + Register SrcLo = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_lo); + Register SrcHi = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_hi); auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vccombine)) .add(Op0) .addReg(PReg, S) @@ -1299,8 +1309,8 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { IsDestLive = true; } if (Op0.getReg() != Op3.getReg()) { - unsigned SrcLo = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_lo); - unsigned SrcHi = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_hi); + Register SrcLo = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_lo); + Register SrcHi = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_hi); auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vnccombine)) .add(Op0) .addReg(PReg, PState) @@ -1856,8 +1866,7 @@ DFAPacketizer *HexagonInstrInfo::CreateTargetScheduleState( // S2_storeri_io %r29, 132, killed %r1; flags: mem:ST4[FixedStack1] // Currently AA considers the addresses in these instructions to be aliasing. bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint( - const MachineInstr &MIa, const MachineInstr &MIb, - AliasAnalysis *AA) const { + const MachineInstr &MIa, const MachineInstr &MIb) const { if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() || MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) return false; @@ -1872,7 +1881,7 @@ bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint( if (!getBaseAndOffsetPosition(MIa, BasePosA, OffsetPosA)) return false; const MachineOperand &BaseA = MIa.getOperand(BasePosA); - unsigned BaseRegA = BaseA.getReg(); + Register BaseRegA = BaseA.getReg(); unsigned BaseSubA = BaseA.getSubReg(); // Get the base register in MIb. 
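With both accesses reduced to the same base register plus an immediate
offset, disjointness becomes a simple interval test: the lower access must
end at or before the higher one begins. A self-contained sketch of that test
(hypothetical helper name; widths in bytes):

  #include <algorithm>
  #include <cstdint>

  // Accesses [OffA, OffA + WidthA) and [OffB, OffB + WidthB) off the same
  // base register cannot overlap if the lower interval ends first.
  bool offsetsDisjoint(int64_t OffA, unsigned WidthA,
                       int64_t OffB, unsigned WidthB) {
    int64_t Low = std::min(OffA, OffB);
    unsigned LowWidth = (Low == OffA) ? WidthA : WidthB;
    return Low + static_cast<int64_t>(LowWidth) <= std::max(OffA, OffB);
  }
  // e.g. the two word stores at r29+132 and r29+136 mentioned in the
  // comment above: offsetsDisjoint(132, 4, 136, 4) == true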
@@ -1880,7 +1889,7 @@ bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint( if (!getBaseAndOffsetPosition(MIb, BasePosB, OffsetPosB)) return false; const MachineOperand &BaseB = MIb.getOperand(BasePosB); - unsigned BaseRegB = BaseB.getReg(); + Register BaseRegB = BaseB.getReg(); unsigned BaseSubB = BaseB.getSubReg(); if (BaseRegA != BaseRegB || BaseSubA != BaseSubB) @@ -1984,7 +1993,7 @@ unsigned HexagonInstrInfo::createVR(MachineFunction *MF, MVT VT) const { llvm_unreachable("Cannot handle this register class"); } - unsigned NewReg = MRI.createVirtualRegister(TRC); + Register NewReg = MRI.createVirtualRegister(TRC); return NewReg; } @@ -2094,12 +2103,12 @@ bool HexagonInstrInfo::isDependent(const MachineInstr &ProdMI, if (RegA == RegB) return true; - if (TargetRegisterInfo::isPhysicalRegister(RegA)) + if (Register::isPhysicalRegister(RegA)) for (MCSubRegIterator SubRegs(RegA, &HRI); SubRegs.isValid(); ++SubRegs) if (RegB == *SubRegs) return true; - if (TargetRegisterInfo::isPhysicalRegister(RegB)) + if (Register::isPhysicalRegister(RegB)) for (MCSubRegIterator SubRegs(RegB, &HRI); SubRegs.isValid(); ++SubRegs) if (RegA == *SubRegs) return true; @@ -2605,7 +2614,7 @@ bool HexagonInstrInfo::isToBeScheduledASAP(const MachineInstr &MI1, const MachineInstr &MI2) const { if (mayBeCurLoad(MI1)) { // if (result of SU is used in Next) return true; - unsigned DstReg = MI1.getOperand(0).getReg(); + Register DstReg = MI1.getOperand(0).getReg(); int N = MI2.getNumOperands(); for (int I = 0; I < N; I++) if (MI2.getOperand(I).isReg() && DstReg == MI2.getOperand(I).getReg()) @@ -3374,7 +3383,7 @@ unsigned HexagonInstrInfo::getCompoundOpcode(const MachineInstr &GA, if ((GA.getOpcode() != Hexagon::C2_cmpeqi) || (GB.getOpcode() != Hexagon::J2_jumptnew)) return -1u; - unsigned DestReg = GA.getOperand(0).getReg(); + Register DestReg = GA.getOperand(0).getReg(); if (!GB.readsRegister(DestReg)) return -1u; if (DestReg != Hexagon::P0 && DestReg != Hexagon::P1) @@ -4091,7 +4100,7 @@ int HexagonInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, // Get DefIdx and UseIdx for super registers. const MachineOperand &DefMO = DefMI.getOperand(DefIdx); - if (DefMO.isReg() && HRI.isPhysicalRegister(DefMO.getReg())) { + if (DefMO.isReg() && Register::isPhysicalRegister(DefMO.getReg())) { if (DefMO.isImplicit()) { for (MCSuperRegIterator SR(DefMO.getReg(), &HRI); SR.isValid(); ++SR) { int Idx = DefMI.findRegisterDefOperandIdx(*SR, false, false, &HRI); diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h index e0a999d0f4c4..60298cd666bb 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/lib/Target/Hexagon/HexagonInstrInfo.h @@ -129,21 +129,10 @@ public: const DebugLoc &DL, int *BytesAdded = nullptr) const override; - /// Analyze the loop code, return true if it cannot be understood. Upon - /// success, this function returns false and returns information about the - /// induction variable and compare instruction used at the end. - bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst, - MachineInstr *&CmpInst) const override; - - /// Generate code to reduce the loop iteration by one and check if the loop - /// is finished. Return the value/register of the new loop count. We need - /// this function when peeling off one or more iterations of a loop. This - /// function assumes the nth iteration is peeled first. 
- unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, - MachineInstr *IndVar, MachineInstr &Cmp, - SmallVectorImpl<MachineOperand> &Cond, - SmallVectorImpl<MachineInstr *> &PrevInsts, - unsigned Iter, unsigned MaxIter) const override; + /// Analyze loop L, which must be a single-basic-block loop, and if the + /// conditions can be understood enough produce a PipelinerLoopInfo object. + std::unique_ptr<PipelinerLoopInfo> + analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override; /// Return true if it's profitable to predicate /// instructions with accumulated instruction latency of "NumCycles" @@ -299,8 +288,7 @@ public: // memory addresses and false otherwise. bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, - const MachineInstr &MIb, - AliasAnalysis *AA = nullptr) const override; + const MachineInstr &MIb) const override; /// For instructions with a base and offset, return the position of the /// base register and offset operands. diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td index cabfd783effa..c5e3cfd080d6 100644 --- a/lib/Target/Hexagon/HexagonIntrinsics.td +++ b/lib/Target/Hexagon/HexagonIntrinsics.td @@ -22,14 +22,14 @@ class T_RP_pat <InstHexagon MI, Intrinsic IntID> def: Pat<(int_hexagon_A2_add IntRegs:$Rs, IntRegs:$Rt), (A2_add IntRegs:$Rs, IntRegs:$Rt)>; -def: Pat<(int_hexagon_A2_addi IntRegs:$Rs, imm:$s16), +def: Pat<(int_hexagon_A2_addi IntRegs:$Rs, timm:$s16), (A2_addi IntRegs:$Rs, imm:$s16)>; def: Pat<(int_hexagon_A2_addp DoubleRegs:$Rs, DoubleRegs:$Rt), (A2_addp DoubleRegs:$Rs, DoubleRegs:$Rt)>; def: Pat<(int_hexagon_A2_sub IntRegs:$Rs, IntRegs:$Rt), (A2_sub IntRegs:$Rs, IntRegs:$Rt)>; -def: Pat<(int_hexagon_A2_subri imm:$s10, IntRegs:$Rs), +def: Pat<(int_hexagon_A2_subri timm:$s10, IntRegs:$Rs), (A2_subri imm:$s10, IntRegs:$Rs)>; def: Pat<(int_hexagon_A2_subp DoubleRegs:$Rs, DoubleRegs:$Rt), (A2_subp DoubleRegs:$Rs, DoubleRegs:$Rt)>; @@ -45,26 +45,26 @@ def: Pat<(int_hexagon_M2_dpmpyss_s0 IntRegs:$Rs, IntRegs:$Rt), def: Pat<(int_hexagon_M2_dpmpyuu_s0 IntRegs:$Rs, IntRegs:$Rt), (M2_dpmpyuu_s0 IntRegs:$Rs, IntRegs:$Rt)>; -def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$Rs, imm:$u5), +def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$Rs, timm:$u5), (S2_asl_i_r IntRegs:$Rs, imm:$u5)>; -def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$Rs, imm:$u5), +def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$Rs, timm:$u5), (S2_lsr_i_r IntRegs:$Rs, imm:$u5)>; -def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$Rs, imm:$u5), +def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$Rs, timm:$u5), (S2_asr_i_r IntRegs:$Rs, imm:$u5)>; -def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$Rs, imm:$u6), +def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$Rs, timm:$u6), (S2_asl_i_p DoubleRegs:$Rs, imm:$u6)>; -def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$Rs, imm:$u6), +def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$Rs, timm:$u6), (S2_lsr_i_p DoubleRegs:$Rs, imm:$u6)>; -def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$Rs, imm:$u6), +def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$Rs, timm:$u6), (S2_asr_i_p DoubleRegs:$Rs, imm:$u6)>; def: Pat<(int_hexagon_A2_and IntRegs:$Rs, IntRegs:$Rt), (A2_and IntRegs:$Rs, IntRegs:$Rt)>; -def: Pat<(int_hexagon_A2_andir IntRegs:$Rs, imm:$s10), +def: Pat<(int_hexagon_A2_andir IntRegs:$Rs, timm:$s10), (A2_andir IntRegs:$Rs, imm:$s10)>; def: Pat<(int_hexagon_A2_or IntRegs:$Rs, IntRegs:$Rt), (A2_or IntRegs:$Rs, IntRegs:$Rt)>; -def: Pat<(int_hexagon_A2_orir IntRegs:$Rs, imm:$s10), +def: Pat<(int_hexagon_A2_orir IntRegs:$Rs, timm:$s10), (A2_orir 
IntRegs:$Rs, imm:$s10)>; def: Pat<(int_hexagon_A2_xor IntRegs:$Rs, IntRegs:$Rt), (A2_xor IntRegs:$Rs, IntRegs:$Rt)>; @@ -99,13 +99,13 @@ def : Pat <(int_hexagon_S5_asrhub_rnd_sat_goodsyntax I64:$Rs, (i32 0)), (S2_vsathub I64:$Rs)>; } -def : Pat <(int_hexagon_S2_asr_i_r_rnd_goodsyntax I32:$Rs, u5_0ImmPred:$imm), +def : Pat <(int_hexagon_S2_asr_i_r_rnd_goodsyntax I32:$Rs, u5_0ImmPred_timm:$imm), (S2_asr_i_r_rnd I32:$Rs, (UDEC1 u5_0ImmPred:$imm))>; -def : Pat <(int_hexagon_S2_asr_i_p_rnd_goodsyntax I64:$Rs, u6_0ImmPred:$imm), +def : Pat <(int_hexagon_S2_asr_i_p_rnd_goodsyntax I64:$Rs, u6_0ImmPred_timm:$imm), (S2_asr_i_p_rnd I64:$Rs, (UDEC1 u6_0ImmPred:$imm))>; -def : Pat <(int_hexagon_S5_vasrhrnd_goodsyntax I64:$Rs, u4_0ImmPred:$imm), +def : Pat <(int_hexagon_S5_vasrhrnd_goodsyntax I64:$Rs, u4_0ImmPred_timm:$imm), (S5_vasrhrnd I64:$Rs, (UDEC1 u4_0ImmPred:$imm))>; -def : Pat <(int_hexagon_S5_asrhub_rnd_sat_goodsyntax I64:$Rs, u4_0ImmPred:$imm), +def : Pat <(int_hexagon_S5_asrhub_rnd_sat_goodsyntax I64:$Rs, u4_0ImmPred_timm:$imm), (S5_asrhub_rnd_sat I64:$Rs, (UDEC1 u4_0ImmPred:$imm))>; def ImmExt64: SDNodeXForm<imm, [{ @@ -121,13 +121,13 @@ def ImmExt64: SDNodeXForm<imm, [{ // To connect the builtin with the instruction, the builtin's operand // needs to be extended to the right type. -def : Pat<(int_hexagon_A2_tfrpi imm:$Is), +def : Pat<(int_hexagon_A2_tfrpi timm:$Is), (A2_tfrpi (ImmExt64 $Is))>; -def : Pat <(int_hexagon_C2_cmpgei I32:$src1, s32_0ImmPred:$src2), +def : Pat <(int_hexagon_C2_cmpgei I32:$src1, s32_0ImmPred_timm:$src2), (C2_tfrpr (C2_cmpgti I32:$src1, (SDEC1 s32_0ImmPred:$src2)))>; -def : Pat <(int_hexagon_C2_cmpgeui I32:$src1, u32_0ImmPred:$src2), +def : Pat <(int_hexagon_C2_cmpgeui I32:$src1, u32_0ImmPred_timm:$src2), (C2_tfrpr (C2_cmpgtui I32:$src1, (UDEC1 u32_0ImmPred:$src2)))>; def : Pat <(int_hexagon_C2_cmpgeui I32:$src, 0), @@ -142,7 +142,7 @@ def : Pat <(int_hexagon_C2_cmpltu I32:$src1, I32:$src2), //===----------------------------------------------------------------------===// class S2op_tableidx_pat <Intrinsic IntID, InstHexagon OutputInst, SDNodeXForm XformImm> - : Pat <(IntID I32:$src1, I32:$src2, u4_0ImmPred:$src3, u5_0ImmPred:$src4), + : Pat <(IntID I32:$src1, I32:$src2, u4_0ImmPred_timm:$src3, u5_0ImmPred_timm:$src4), (OutputInst I32:$src1, I32:$src2, u4_0ImmPred:$src3, (XformImm u5_0ImmPred:$src4))>; @@ -197,11 +197,11 @@ class T_stc_pat <InstHexagon MI, Intrinsic IntID, PatLeaf Imm, PatLeaf Val> : Pat<(IntID I32:$Rs, Val:$Rt, I32:$Ru, Imm:$s), (MI I32:$Rs, Imm:$s, I32:$Ru, Val:$Rt)>; -def: T_stc_pat<S2_storerb_pci, int_hexagon_circ_stb, s4_0ImmPred, I32>; -def: T_stc_pat<S2_storerh_pci, int_hexagon_circ_sth, s4_1ImmPred, I32>; -def: T_stc_pat<S2_storeri_pci, int_hexagon_circ_stw, s4_2ImmPred, I32>; -def: T_stc_pat<S2_storerd_pci, int_hexagon_circ_std, s4_3ImmPred, I64>; -def: T_stc_pat<S2_storerf_pci, int_hexagon_circ_sthhi, s4_1ImmPred, I32>; +def: T_stc_pat<S2_storerb_pci, int_hexagon_circ_stb, s4_0ImmPred_timm, I32>; +def: T_stc_pat<S2_storerh_pci, int_hexagon_circ_sth, s4_1ImmPred_timm, I32>; +def: T_stc_pat<S2_storeri_pci, int_hexagon_circ_stw, s4_2ImmPred_timm, I32>; +def: T_stc_pat<S2_storerd_pci, int_hexagon_circ_std, s4_3ImmPred_timm, I64>; +def: T_stc_pat<S2_storerf_pci, int_hexagon_circ_sthhi, s4_1ImmPred_timm, I32>; multiclass MaskedStore <InstHexagon MI, Intrinsic IntID> { def : Pat<(IntID HvxQR:$src1, IntRegs:$src2, HvxVR:$src3), diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index 
ac48e1dc30b0..bda3eccac0cd 100644 --- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -93,9 +93,9 @@ static cl::opt<bool> OnlyNonNestedMemmove("only-nonnested-memmove-idiom", cl::Hidden, cl::init(true), cl::desc("Only enable generating memmove in non-nested loops")); -cl::opt<bool> HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy", - cl::Hidden, cl::init(false), - cl::desc("Enable Hexagon-specific memcpy for volatile destination.")); +static cl::opt<bool> HexagonVolatileMemcpy( + "disable-hexagon-volatile-memcpy", cl::Hidden, cl::init(false), + cl::desc("Enable Hexagon-specific memcpy for volatile destination.")); static cl::opt<unsigned> SimplifyLimit("hlir-simplify-limit", cl::init(10000), cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR")); @@ -632,9 +632,9 @@ Value *PolynomialMultiplyRecognize::getCountIV(BasicBlock *BB) { if (!isa<ConstantInt>(InitV) || !cast<ConstantInt>(InitV)->isZero()) continue; Value *IterV = PN->getIncomingValueForBlock(BB); - if (!isa<BinaryOperator>(IterV)) - continue; auto *BO = dyn_cast<BinaryOperator>(IterV); + if (!BO) + continue; if (BO->getOpcode() != Instruction::Add) continue; Value *IncV = nullptr; @@ -2020,7 +2020,7 @@ bool HexagonLoopIdiomRecognize::processCopyingStore(Loop *CurLoop, // See if the pointer expression is an AddRec like {base,+,1} on the current // loop, which indicates a strided load. If we have something else, it's a // random load we can't handle. - LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand()); + auto *LI = cast<LoadInst>(SI->getValueOperand()); auto *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand())); // The trip count of the loop and the base pointer of the addrec SCEV is @@ -2426,7 +2426,8 @@ bool HexagonLoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { DL = &L->getHeader()->getModule()->getDataLayout(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LF = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI( + *L->getHeader()->getParent()); SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); HasMemcpy = TLI->has(LibFunc_memcpy); diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp index db44901ca706..680d01e12af0 100644 --- a/lib/Target/Hexagon/HexagonNewValueJump.cpp +++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -177,7 +177,7 @@ static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII, (II->getOperand(i).isUse() || II->getOperand(i).isDef())) { MachineBasicBlock::iterator localII = II; ++localII; - unsigned Reg = II->getOperand(i).getReg(); + Register Reg = II->getOperand(i).getReg(); for (MachineBasicBlock::iterator localBegin = localII; localBegin != end; ++localBegin) { if (localBegin == skip) @@ -290,7 +290,7 @@ static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII, // at machine code level, we don't need this, but if we decide // to move new value jump prior to RA, we would be needing this. 
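// [editor's note] The getCountIV hunk above replaces an isa<> test followed by
// an unchecked dyn_cast with a single null-checked dyn_cast. A minimal sketch
// of the resulting idiom (hypothetical helper, not code from the pass):
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Returns the add feeding the candidate induction variable, or null.
static BinaryOperator *getIncrement(Value *IterV) {
  auto *BO = dyn_cast<BinaryOperator>(IterV); // null on type mismatch
  if (!BO || BO->getOpcode() != Instruction::Add)
    return nullptr;
  return BO;
}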
MachineRegisterInfo &MRI = MF.getRegInfo(); - if (secondReg && !TargetRegisterInfo::isPhysicalRegister(cmpOp2)) { + if (secondReg && !Register::isPhysicalRegister(cmpOp2)) { MachineInstr *def = MRI.getVRegDef(cmpOp2); if (def->getOpcode() == TargetOpcode::COPY) return false; @@ -516,7 +516,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { jmpPos = MII; jmpInstr = &MI; predReg = MI.getOperand(0).getReg(); - afterRA = TargetRegisterInfo::isPhysicalRegister(predReg); + afterRA = Register::isPhysicalRegister(predReg); // If ifconverter had not messed up with the kill flags of the // operands, the following check on the kill flag would suffice. @@ -603,7 +603,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { (isSecondOpReg && MI.getOperand(0).getReg() == (unsigned)cmpOp2))) { - unsigned feederReg = MI.getOperand(0).getReg(); + Register feederReg = MI.getOperand(0).getReg(); // First try to see if we can get the feeder from the first operand // of the compare. If we can not, and if secondOpReg is true @@ -651,7 +651,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { for (MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isUse()) continue; - unsigned UseR = MO.getReg(); + Register UseR = MO.getReg(); for (auto I = std::next(MI.getIterator()); I != jmpPos; ++I) { if (I == cmpPos) continue; diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp index 547da9fd598f..9121115020a2 100644 --- a/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -162,7 +162,7 @@ bool HexagonOptAddrMode::canRemoveAddasl(NodeAddr<StmtNode *> AddAslSN, if (!OffsetOp.isImm() || OffsetOp.getImm() > 3) return false; - unsigned OffsetReg = MI.getOperand(2).getReg(); + Register OffsetReg = MI.getOperand(2).getReg(); RegisterRef OffsetRR; NodeId OffsetRegRD = 0; for (NodeAddr<UseNode *> UA : AddAslSN.Addr->members_if(DFG->IsUse, *DFG)) { @@ -348,7 +348,7 @@ bool HexagonOptAddrMode::processAddUses(NodeAddr<StmtNode *> AddSN, MachineInstr *AddMI, const NodeList &UNodeList) { - unsigned AddDefR = AddMI->getOperand(0).getReg(); + Register AddDefR = AddMI->getOperand(0).getReg(); for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) { NodeAddr<UseNode *> UN = *I; NodeAddr<StmtNode *> SN = UN.Addr->getOwner(*DFG); @@ -381,7 +381,7 @@ bool HexagonOptAddrMode::processAddUses(NodeAddr<StmtNode *> AddSN, // Ex: Rx= add(Rt,#10) // memw(Rx+#0) = Rs // will be replaced with => memw(Rt+#10) = Rs - unsigned BaseReg = AddMI->getOperand(1).getReg(); + Register BaseReg = AddMI->getOperand(1).getReg(); if (!isSafeToExtLR(AddSN, AddMI, BaseReg, UNodeList)) return false; } @@ -411,7 +411,7 @@ bool HexagonOptAddrMode::updateAddUses(MachineInstr *AddMI, MachineInstr *UseMI) { const MachineOperand ImmOp = AddMI->getOperand(2); const MachineOperand AddRegOp = AddMI->getOperand(1); - unsigned newReg = AddRegOp.getReg(); + Register newReg = AddRegOp.getReg(); const MCInstrDesc &MID = UseMI->getDesc(); MachineOperand &BaseOp = MID.mayLoad() ? 
UseMI->getOperand(1) @@ -543,7 +543,7 @@ bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp, bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp, unsigned ImmOpNum) { bool Changed = false; - unsigned OpStart; + unsigned OpStart = 0; unsigned OpEnd = OldMI->getNumOperands(); MachineBasicBlock *BB = OldMI->getParent(); auto UsePos = MachineBasicBlock::iterator(OldMI); @@ -724,7 +724,7 @@ bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) { } short SizeInc = 0; - unsigned DefR = MI->getOperand(0).getReg(); + Register DefR = MI->getOperand(0).getReg(); InstrEvalMap InstrEvalResult; // Analyze all uses and calculate increase in size. Perform the optimization diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index fb731f56bfbf..485e658e1c84 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -99,13 +99,21 @@ def HWI8: PatLeaf<(VecPI8 HvxWR:$R)>; def HWI16: PatLeaf<(VecPI16 HvxWR:$R)>; def HWI32: PatLeaf<(VecPI32 HvxWR:$R)>; +def SDTVecLeaf: + SDTypeProfile<1, 0, [SDTCisVec<0>]>; def SDTVecVecIntOp: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>, SDTCisVT<3,i32>]>; +def HexagonPTRUE: SDNode<"HexagonISD::PTRUE", SDTVecLeaf>; +def HexagonPFALSE: SDNode<"HexagonISD::PFALSE", SDTVecLeaf>; def HexagonVALIGN: SDNode<"HexagonISD::VALIGN", SDTVecVecIntOp>; def HexagonVALIGNADDR: SDNode<"HexagonISD::VALIGNADDR", SDTIntUnaryOp>; +def ptrue: PatFrag<(ops), (HexagonPTRUE)>; +def pfalse: PatFrag<(ops), (HexagonPFALSE)>; +def pnot: PatFrag<(ops node:$Pu), (xor node:$Pu, ptrue)>; + def valign: PatFrag<(ops node:$Vt, node:$Vs, node:$Ru), (HexagonVALIGN node:$Vt, node:$Vs, node:$Ru)>; def valignaddr: PatFrag<(ops node:$Addr), (HexagonVALIGNADDR node:$Addr)>; @@ -154,6 +162,11 @@ def IsNPow2_64H: PatLeaf<(i64 imm), [{ return isPowerOf2_64(NV) && Log2_64(NV) >= 32; }]>; +class IsULE<int Width, int Arg>: PatLeaf<(i32 imm), + "uint64_t V = N->getZExtValue();" # + "return isUInt<" # Width # ">(V) && V <= " # Arg # ";" +>; + class IsUGT<int Width, int Arg>: PatLeaf<(i32 imm), "uint64_t V = N->getZExtValue();" # "return isUInt<" # Width # ">(V) && V > " # Arg # ";" @@ -320,6 +333,24 @@ multiclass SelMinMax_pats<PatFrag CmpOp, PatFrag Val, (InstB Val:$A, Val:$B)>; } +multiclass MinMax_pats<InstHexagon PickT, InstHexagon PickS, + PatFrag Sel, PatFrag CmpOp, + ValueType CmpType, PatFrag CmpPred> { + def: Pat<(Sel (CmpType (CmpOp CmpPred:$Vs, CmpPred:$Vt)), + CmpPred:$Vt, CmpPred:$Vs), + (PickT CmpPred:$Vs, CmpPred:$Vt)>; + def: Pat<(Sel (CmpType (CmpOp CmpPred:$Vs, CmpPred:$Vt)), + CmpPred:$Vs, CmpPred:$Vt), + (PickS CmpPred:$Vs, CmpPred:$Vt)>; +} + +// Bitcasts between same-size vector types are no-ops, except for the +// actual type change. +multiclass NopCast_pat<ValueType Ty1, ValueType Ty2, RegisterClass RC> { + def: Pat<(Ty1 (bitconvert (Ty2 RC:$Val))), (Ty1 RC:$Val)>; + def: Pat<(Ty2 (bitconvert (Ty1 RC:$Val))), (Ty2 RC:$Val)>; +} + // Frags for commonly used SDNodes. def Add: pf2<add>; def And: pf2<and>; def Sra: pf2<sra>; @@ -403,17 +434,18 @@ def: Pat<(f32 (bitconvert I32:$v)), (F32:$v)>; def: Pat<(i64 (bitconvert F64:$v)), (I64:$v)>; def: Pat<(f64 (bitconvert I64:$v)), (F64:$v)>; -multiclass Cast_pat<ValueType Ta, ValueType Tb, RegisterClass RC> { - def: Pat<(Tb (bitconvert (Ta RC:$Rs))), (Tb RC:$Rs)>; - def: Pat<(Ta (bitconvert (Tb RC:$Rs))), (Ta RC:$Rs)>; -} - -// Bit convert vector types to integers. 
-defm: Cast_pat<v4i8, i32, IntRegs>; -defm: Cast_pat<v2i16, i32, IntRegs>; -defm: Cast_pat<v8i8, i64, DoubleRegs>; -defm: Cast_pat<v4i16, i64, DoubleRegs>; -defm: Cast_pat<v2i32, i64, DoubleRegs>; +// Bit convert 32- and 64-bit types. +// All of these are bitcastable to one another: i32, v2i16, v4i8. +defm: NopCast_pat<i32, v2i16, IntRegs>; +defm: NopCast_pat<i32, v4i8, IntRegs>; +defm: NopCast_pat<v2i16, v4i8, IntRegs>; +// All of these are bitcastable to one another: i64, v2i32, v4i16, v8i8. +defm: NopCast_pat<i64, v2i32, DoubleRegs>; +defm: NopCast_pat<i64, v4i16, DoubleRegs>; +defm: NopCast_pat<i64, v8i8, DoubleRegs>; +defm: NopCast_pat<v2i32, v4i16, DoubleRegs>; +defm: NopCast_pat<v2i32, v8i8, DoubleRegs>; +defm: NopCast_pat<v4i16, v8i8, DoubleRegs>; // --(3) Extend/truncate ------------------------------------------------- @@ -497,7 +529,9 @@ def: Pat<(v2i16 (trunc V2I32:$Rs)), // def: Pat<(not I1:$Ps), (C2_not I1:$Ps)>; -def: Pat<(not V8I1:$Ps), (C2_not V8I1:$Ps)>; +def: Pat<(pnot V2I1:$Ps), (C2_not V2I1:$Ps)>; +def: Pat<(pnot V4I1:$Ps), (C2_not V4I1:$Ps)>; +def: Pat<(pnot V8I1:$Ps), (C2_not V8I1:$Ps)>; def: Pat<(add I1:$Ps, -1), (C2_not I1:$Ps)>; multiclass BoolOpR_RR_pat<InstHexagon MI, PatFrag Op> { @@ -816,14 +850,6 @@ def: Pat<(select (not I1:$Pu), f32ImmPred:$I, F32:$Rs), def: Pat<(select (not I1:$Pu), F32:$Rt, f32ImmPred:$I), (C2_muxri I1:$Pu, (ftoi $I), F32:$Rt)>; -def: Pat<(select I1:$Pu, V4I8:$Rs, V4I8:$Rt), - (LoReg (C2_vmux I1:$Pu, (ToAext64 $Rs), (ToAext64 $Rt)))>; -def: Pat<(select I1:$Pu, V2I16:$Rs, V2I16:$Rt), - (LoReg (C2_vmux I1:$Pu, (ToAext64 $Rs), (ToAext64 $Rt)))>; -def: Pat<(select I1:$Pu, V2I32:$Rs, V2I32:$Rt), - (Combinew (C2_mux I1:$Pu, (HiReg $Rs), (HiReg $Rt)), - (C2_mux I1:$Pu, (LoReg $Rs), (LoReg $Rt)))>; - def: Pat<(vselect V8I1:$Pu, V8I8:$Rs, V8I8:$Rt), (C2_vmux V8I1:$Pu, V8I8:$Rs, V8I8:$Rt)>; def: Pat<(vselect V4I1:$Pu, V4I16:$Rs, V4I16:$Rt), @@ -831,6 +857,14 @@ def: Pat<(vselect V4I1:$Pu, V4I16:$Rs, V4I16:$Rt), def: Pat<(vselect V2I1:$Pu, V2I32:$Rs, V2I32:$Rt), (C2_vmux V2I1:$Pu, V2I32:$Rs, V2I32:$Rt)>; +def: Pat<(vselect (pnot V8I1:$Pu), V8I8:$Rs, V8I8:$Rt), + (C2_vmux V8I1:$Pu, V8I8:$Rt, V8I8:$Rs)>; +def: Pat<(vselect (pnot V4I1:$Pu), V4I16:$Rs, V4I16:$Rt), + (C2_vmux V4I1:$Pu, V4I16:$Rt, V4I16:$Rs)>; +def: Pat<(vselect (pnot V2I1:$Pu), V2I32:$Rs, V2I32:$Rt), + (C2_vmux V2I1:$Pu, V2I32:$Rt, V2I32:$Rs)>; + + // From LegalizeDAG.cpp: (Pu ? Pv : Pw) <=> (Pu & Pv) | (!Pu & Pw). 
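// [editor's note] The vselect-of-pnot patterns a few lines up rewrite
// select(!p, a, b) as select(p, b, a), so a single C2_vmux still suffices.
// Standalone check of the operand-swap identity on scalar lanes:
#include <cassert>
static int mux(bool p, int a, int b) { return p ? a : b; }
int main() {
  for (bool p : {false, true})
    assert(mux(!p, 1, 2) == mux(p, 2, 1)); // the swap absorbs the negation
}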
def: Pat<(select I1:$Pu, I1:$Pv, I1:$Pw), (C2_or (C2_and I1:$Pu, I1:$Pv), @@ -863,32 +897,44 @@ let AddedComplexity = 200 in { } let AddedComplexity = 200 in { - defm: SelMinMax_pats<setge, I32, A2_max, A2_min>; - defm: SelMinMax_pats<setgt, I32, A2_max, A2_min>; - defm: SelMinMax_pats<setle, I32, A2_min, A2_max>; - defm: SelMinMax_pats<setlt, I32, A2_min, A2_max>; - defm: SelMinMax_pats<setuge, I32, A2_maxu, A2_minu>; - defm: SelMinMax_pats<setugt, I32, A2_maxu, A2_minu>; - defm: SelMinMax_pats<setule, I32, A2_minu, A2_maxu>; - defm: SelMinMax_pats<setult, I32, A2_minu, A2_maxu>; + defm: MinMax_pats<A2_min, A2_max, select, setgt, i1, I32>; + defm: MinMax_pats<A2_min, A2_max, select, setge, i1, I32>; + defm: MinMax_pats<A2_max, A2_min, select, setlt, i1, I32>; + defm: MinMax_pats<A2_max, A2_min, select, setle, i1, I32>; + defm: MinMax_pats<A2_minu, A2_maxu, select, setugt, i1, I32>; + defm: MinMax_pats<A2_minu, A2_maxu, select, setuge, i1, I32>; + defm: MinMax_pats<A2_maxu, A2_minu, select, setult, i1, I32>; + defm: MinMax_pats<A2_maxu, A2_minu, select, setule, i1, I32>; - defm: SelMinMax_pats<setge, I64, A2_maxp, A2_minp>; - defm: SelMinMax_pats<setgt, I64, A2_maxp, A2_minp>; - defm: SelMinMax_pats<setle, I64, A2_minp, A2_maxp>; - defm: SelMinMax_pats<setlt, I64, A2_minp, A2_maxp>; - defm: SelMinMax_pats<setuge, I64, A2_maxup, A2_minup>; - defm: SelMinMax_pats<setugt, I64, A2_maxup, A2_minup>; - defm: SelMinMax_pats<setule, I64, A2_minup, A2_maxup>; - defm: SelMinMax_pats<setult, I64, A2_minup, A2_maxup>; + defm: MinMax_pats<A2_minp, A2_maxp, select, setgt, i1, I64>; + defm: MinMax_pats<A2_minp, A2_maxp, select, setge, i1, I64>; + defm: MinMax_pats<A2_maxp, A2_minp, select, setlt, i1, I64>; + defm: MinMax_pats<A2_maxp, A2_minp, select, setle, i1, I64>; + defm: MinMax_pats<A2_minup, A2_maxup, select, setugt, i1, I64>; + defm: MinMax_pats<A2_minup, A2_maxup, select, setuge, i1, I64>; + defm: MinMax_pats<A2_maxup, A2_minup, select, setult, i1, I64>; + defm: MinMax_pats<A2_maxup, A2_minup, select, setule, i1, I64>; } let AddedComplexity = 100 in { - defm: SelMinMax_pats<setolt, F32, F2_sfmin, F2_sfmax>; - defm: SelMinMax_pats<setole, F32, F2_sfmin, F2_sfmax>; - defm: SelMinMax_pats<setogt, F32, F2_sfmax, F2_sfmin>; - defm: SelMinMax_pats<setoge, F32, F2_sfmax, F2_sfmin>; + defm: MinMax_pats<F2_sfmin, F2_sfmax, select, setogt, i1, F32>; + defm: MinMax_pats<F2_sfmin, F2_sfmax, select, setoge, i1, F32>; + defm: MinMax_pats<F2_sfmax, F2_sfmin, select, setolt, i1, F32>; + defm: MinMax_pats<F2_sfmax, F2_sfmin, select, setole, i1, F32>; } +defm: MinMax_pats<A2_vminb, A2_vmaxb, vselect, setgt, v8i1, V8I8>; +defm: MinMax_pats<A2_vminb, A2_vmaxb, vselect, setge, v8i1, V8I8>; +defm: MinMax_pats<A2_vminh, A2_vmaxh, vselect, setgt, v4i1, V4I16>; +defm: MinMax_pats<A2_vminh, A2_vmaxh, vselect, setge, v4i1, V4I16>; +defm: MinMax_pats<A2_vminw, A2_vmaxw, vselect, setgt, v2i1, V2I32>; +defm: MinMax_pats<A2_vminw, A2_vmaxw, vselect, setge, v2i1, V2I32>; +defm: MinMax_pats<A2_vminub, A2_vmaxub, vselect, setugt, v8i1, V8I8>; +defm: MinMax_pats<A2_vminub, A2_vmaxub, vselect, setuge, v8i1, V8I8>; +defm: MinMax_pats<A2_vminuh, A2_vmaxuh, vselect, setugt, v4i1, V4I16>; +defm: MinMax_pats<A2_vminuh, A2_vmaxuh, vselect, setuge, v4i1, V4I16>; +defm: MinMax_pats<A2_vminuw, A2_vmaxuw, vselect, setugt, v2i1, V2I32>; +defm: MinMax_pats<A2_vminuw, A2_vmaxuw, vselect, setuge, v2i1, V2I32>; // --(7) Insert/extract -------------------------------------------------- // @@ -1639,19 +1685,19 @@ def: Pat<(v8i8 (mul V8I8:$Rs, 
V8I8:$Rt)), // // Count leading zeros. -def: Pat<(ctlz I32:$Rs), (S2_cl0 I32:$Rs)>; +def: Pat<(i32 (ctlz I32:$Rs)), (S2_cl0 I32:$Rs)>; def: Pat<(i32 (trunc (ctlz I64:$Rss))), (S2_cl0p I64:$Rss)>; // Count trailing zeros. -def: Pat<(cttz I32:$Rs), (S2_ct0 I32:$Rs)>; +def: Pat<(i32 (cttz I32:$Rs)), (S2_ct0 I32:$Rs)>; def: Pat<(i32 (trunc (cttz I64:$Rss))), (S2_ct0p I64:$Rss)>; // Count leading ones. -def: Pat<(ctlz (not I32:$Rs)), (S2_cl1 I32:$Rs)>; +def: Pat<(i32 (ctlz (not I32:$Rs))), (S2_cl1 I32:$Rs)>; def: Pat<(i32 (trunc (ctlz (not I64:$Rss)))), (S2_cl1p I64:$Rss)>; // Count trailing ones. -def: Pat<(cttz (not I32:$Rs)), (S2_ct1 I32:$Rs)>; +def: Pat<(i32 (cttz (not I32:$Rs))), (S2_ct1 I32:$Rs)>; def: Pat<(i32 (trunc (cttz (not I64:$Rss)))), (S2_ct1p I64:$Rss)>; // Define leading/trailing patterns that require zero-extensions to 64 bits. @@ -1706,6 +1752,7 @@ let AddedComplexity = 20 in { // Complexity greater than and/or/xor (i32 (LoReg $Rss)))>; } + let AddedComplexity = 20 in { // Complexity greater than cmp reg-imm. def: Pat<(i1 (setne (and (shl 1, u5_0ImmPred:$u5), I32:$Rs), 0)), (S2_tstbit_i IntRegs:$Rs, imm:$u5)>; @@ -1717,6 +1764,20 @@ let AddedComplexity = 20 in { // Complexity greater than cmp reg-imm. (S2_tstbit_i (LoReg DoubleRegs:$Rs), 0)>; } +def: Pat<(and (srl I32:$Rs, u5_0ImmPred:$u5), 1), + (I1toI32 (S2_tstbit_i I32:$Rs, imm:$u5))>; +def: Pat<(and (srl I64:$Rss, IsULE<32,31>:$u6), 1), + (ToZext64 (I1toI32 (S2_tstbit_i (LoReg $Rss), imm:$u6)))>; +def: Pat<(and (srl I64:$Rss, IsUGT<32,31>:$u6), 1), + (ToZext64 (I1toI32 (S2_tstbit_i (HiReg $Rss), (UDEC32 $u6))))>; + +def: Pat<(and (not (srl I32:$Rs, u5_0ImmPred:$u5)), 1), + (I1toI32 (S4_ntstbit_i I32:$Rs, imm:$u5))>; +def: Pat<(and (not (srl I64:$Rss, IsULE<32,31>:$u6)), 1), + (ToZext64 (I1toI32 (S4_ntstbit_i (LoReg $Rss), imm:$u6)))>; +def: Pat<(and (not (srl I64:$Rss, IsUGT<32,31>:$u6)), 1), + (ToZext64 (I1toI32 (S4_ntstbit_i (HiReg $Rss), (UDEC32 $u6))))>; + let AddedComplexity = 20 in { // Complexity greater than compare reg-imm. def: Pat<(i1 (seteq (and I32:$Rs, u6_0ImmPred:$u6), 0)), (C2_bitsclri IntRegs:$Rs, imm:$u6)>; @@ -1737,23 +1798,28 @@ def: Pat<(HexagonTSTBIT I32:$Rs, u5_0ImmPred:$u5), def: Pat<(HexagonTSTBIT I32:$Rs, I32:$Rt), (S2_tstbit_r I32:$Rs, I32:$Rt)>; +// Add extra complexity to prefer these instructions over bitsset/bitsclr. +// The reason is that tstbit/ntstbit can be folded into a compound instruction: +// if ([!]tstbit(...)) jump ... let AddedComplexity = 20 in { // Complexity greater than cmp reg-imm. - def: Pat<(i1 (seteq (and (shl 1, u5_0ImmPred:$u5), I32:$Rs), 0)), - (S4_ntstbit_i I32:$Rs, imm:$u5)>; + def: Pat<(i1 (seteq (and I32:$Rs, IsPow2_32:$u5), 0)), + (S4_ntstbit_i I32:$Rs, (Log2_32 imm:$u5))>; + def: Pat<(i1 (setne (and I32:$Rs, IsPow2_32:$u5), 0)), + (S2_tstbit_i I32:$Rs, (Log2_32 imm:$u5))>; def: Pat<(i1 (seteq (and (shl 1, I32:$Rt), I32:$Rs), 0)), (S4_ntstbit_r I32:$Rs, I32:$Rt)>; + def: Pat<(i1 (setne (and (shl 1, I32:$Rt), I32:$Rs), 0)), + (S2_tstbit_r I32:$Rs, I32:$Rt)>; } -// Add extra complexity to prefer these instructions over bitsset/bitsclr. -// The reason is that tstbit/ntstbit can be folded into a compound instruction: -// if ([!]tstbit(...)) jump ... 
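// [editor's note] The new tstbit patterns above cover the usual C spellings of
// a single-bit test; the functions below show the shapes being matched, both
// (and (srl x, n), 1) and the power-of-two mask handled via Log2:
#include <cstdint>
static bool bit_shift(uint32_t x, unsigned n) { return (x >> n) & 1; }
static bool bit_mask(uint32_t x, unsigned n) { return (x & (1u << n)) != 0; }
static bool bit_pow2(uint32_t x) { return (x & 0x20u) != 0; } // Log2_32 -> bit 5
int main() {
  return bit_shift(0x20, 5) && bit_mask(0x20, 5) && bit_pow2(0x20) ? 0 : 1;
}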
-let AddedComplexity = 100 in -def: Pat<(i1 (setne (and I32:$Rs, (i32 IsPow2_32:$u5)), (i32 0))), - (S2_tstbit_i I32:$Rs, (Log2_32 imm:$u5))>; - -let AddedComplexity = 100 in -def: Pat<(i1 (seteq (and I32:$Rs, (i32 IsPow2_32:$u5)), (i32 0))), - (S4_ntstbit_i I32:$Rs, (Log2_32 imm:$u5))>; +def: Pat<(i1 (seteq (and I64:$Rs, IsPow2_64L:$u6), 0)), + (S4_ntstbit_i (LoReg $Rs), (Log2_64 $u6))>; +def: Pat<(i1 (seteq (and I64:$Rs, IsPow2_64H:$u6), 0)), + (S4_ntstbit_i (HiReg $Rs), (UDEC32 (i32 (Log2_64 $u6))))>; +def: Pat<(i1 (setne (and I64:$Rs, IsPow2_64L:$u6), 0)), + (S2_tstbit_i (LoReg $Rs), (Log2_32 imm:$u6))>; +def: Pat<(i1 (setne (and I64:$Rs, IsPow2_64H:$u6), 0)), + (S2_tstbit_i (HiReg $Rs), (UDEC32 (i32 (Log2_32 imm:$u6))))>; // Do not increase complexity of these patterns. In the DAG, "cmp i8" may be // represented as a compare against "value & 0xFF", which is an exact match @@ -1773,10 +1839,18 @@ def: Pat<(i1 (setne (and I32:$Rs, I32:$Rt), I32:$Rt)), let AddedComplexity = 100 in { // Avoid A4_rcmp[n]eqi in these cases: + def: Pat<(i32 (zext (i1 (seteq (and (shl 1, I32:$Rt), I32:$Rs), 0)))), + (I1toI32 (S4_ntstbit_r IntRegs:$Rs, IntRegs:$Rt))>; def: Pat<(i32 (zext (i1 (setne (and (shl 1, I32:$Rt), I32:$Rs), 0)))), (I1toI32 (S2_tstbit_r IntRegs:$Rs, IntRegs:$Rt))>; + def: Pat<(i32 (zext (i1 (seteq (and I32:$Rs, IsPow2_32:$u5), 0)))), + (I1toI32 (S4_ntstbit_i I32:$Rs, (Log2_32 imm:$u5)))>; + def: Pat<(i32 (zext (i1 (setne (and I32:$Rs, IsPow2_32:$u5), 0)))), + (I1toI32 (S2_tstbit_i I32:$Rs, (Log2_32 imm:$u5)))>; def: Pat<(i32 (zext (i1 (seteq (and (shl 1, I32:$Rt), I32:$Rs), 0)))), - (I1toI32 (S4_ntstbit_r IntRegs:$Rs, IntRegs:$Rt))>; + (I1toI32 (S4_ntstbit_r I32:$Rs, I32:$Rt))>; + def: Pat<(i32 (zext (i1 (setne (and (shl 1, I32:$Rt), I32:$Rs), 0)))), + (I1toI32 (S2_tstbit_r I32:$Rs, I32:$Rt))>; } // --(11) PIC ------------------------------------------------------------ diff --git a/lib/Target/Hexagon/HexagonPatternsHVX.td b/lib/Target/Hexagon/HexagonPatternsHVX.td index a4cfca9ac7d7..078a7135c55b 100644 --- a/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -1,5 +1,3 @@ -def SDTVecLeaf: - SDTypeProfile<1, 0, [SDTCisVec<0>]>; def SDTVecBinOp: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>]>; @@ -162,23 +160,14 @@ let Predicates = [UseHVX] in { // Bitcasts between same-size vector types are no-ops, except for the // actual type change. 
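// [editor's note] As the trailing comment above says, same-size vector
// bitcasts are no-ops; NopCast_pat just records the type change. Host-side
// illustration of the "same bits, new type" property (assumes C++20 for
// std::bit_cast):
#include <array>
#include <bit>
#include <cassert>
#include <cstdint>
int main() {
  uint32_t w = 0x01020304;
  auto b = std::bit_cast<std::array<uint8_t, 4>>(w); // i32 viewed as v4i8
  assert(std::bit_cast<uint32_t>(b) == w);           // round trip is identity
}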
-class Bitcast<ValueType ResTy, ValueType InpTy, RegisterClass RC> - : Pat<(ResTy (bitconvert (InpTy RC:$Val))), (ResTy RC:$Val)>; - let Predicates = [UseHVX] in { - def: Bitcast<VecI8, VecI16, HvxVR>; - def: Bitcast<VecI8, VecI32, HvxVR>; - def: Bitcast<VecI16, VecI8, HvxVR>; - def: Bitcast<VecI16, VecI32, HvxVR>; - def: Bitcast<VecI32, VecI8, HvxVR>; - def: Bitcast<VecI32, VecI16, HvxVR>; + defm: NopCast_pat<VecI8, VecI16, HvxVR>; + defm: NopCast_pat<VecI8, VecI32, HvxVR>; + defm: NopCast_pat<VecI16, VecI32, HvxVR>; - def: Bitcast<VecPI8, VecPI16, HvxWR>; - def: Bitcast<VecPI8, VecPI32, HvxWR>; - def: Bitcast<VecPI16, VecPI8, HvxWR>; - def: Bitcast<VecPI16, VecPI32, HvxWR>; - def: Bitcast<VecPI32, VecPI8, HvxWR>; - def: Bitcast<VecPI32, VecPI16, HvxWR>; + defm: NopCast_pat<VecPI8, VecPI16, HvxWR>; + defm: NopCast_pat<VecPI8, VecPI32, HvxWR>; + defm: NopCast_pat<VecPI16, VecPI32, HvxWR>; } let Predicates = [UseHVX] in { @@ -260,6 +249,21 @@ class Vnot<ValueType VecTy> : PatFrag<(ops node:$Vs), (xor $Vs, Vneg1<VecTy>)>; let Predicates = [UseHVX] in { + let AddedComplexity = 220 in { + defm: MinMax_pats<V6_vminb, V6_vmaxb, vselect, setgt, VecQ8, HVI8>; + defm: MinMax_pats<V6_vminb, V6_vmaxb, vselect, setge, VecQ8, HVI8>; + defm: MinMax_pats<V6_vminub, V6_vmaxub, vselect, setugt, VecQ8, HVI8>; + defm: MinMax_pats<V6_vminub, V6_vmaxub, vselect, setuge, VecQ8, HVI8>; + defm: MinMax_pats<V6_vminh, V6_vmaxh, vselect, setgt, VecQ16, HVI16>; + defm: MinMax_pats<V6_vminh, V6_vmaxh, vselect, setge, VecQ16, HVI16>; + defm: MinMax_pats<V6_vminuh, V6_vmaxuh, vselect, setugt, VecQ16, HVI16>; + defm: MinMax_pats<V6_vminuh, V6_vmaxuh, vselect, setuge, VecQ16, HVI16>; + defm: MinMax_pats<V6_vminw, V6_vmaxw, vselect, setgt, VecQ32, HVI32>; + defm: MinMax_pats<V6_vminw, V6_vmaxw, vselect, setge, VecQ32, HVI32>; + } +} + +let Predicates = [UseHVX] in { let AddedComplexity = 200 in { def: Pat<(Vnot<VecI8> HVI8:$Vs), (V6_vnot HvxVR:$Vs)>; def: Pat<(Vnot<VecI16> HVI16:$Vs), (V6_vnot HvxVR:$Vs)>; diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp index 8f761d2d4805..0ccfe64ad1e5 100644 --- a/lib/Target/Hexagon/HexagonPeephole.cpp +++ b/lib/Target/Hexagon/HexagonPeephole.cpp @@ -136,11 +136,11 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { assert(MI.getNumOperands() == 2); MachineOperand &Dst = MI.getOperand(0); MachineOperand &Src = MI.getOperand(1); - unsigned DstReg = Dst.getReg(); - unsigned SrcReg = Src.getReg(); + Register DstReg = Dst.getReg(); + Register SrcReg = Src.getReg(); // Just handle virtual registers. 
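// [editor's note] The MinMax_pats instances above fold a compare feeding a
// (v)select into one min/max instruction. Scalar statement of the two
// identities being matched:
#include <cassert>
static int sel_max(int a, int b) { return a > b ? a : b; } // select(setgt(a,b), a, b)
static int sel_min(int a, int b) { return a > b ? b : a; } // same compare, operands swapped
int main() { assert(sel_max(3, 7) == 7 && sel_min(3, 7) == 3); }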
- if (TargetRegisterInfo::isVirtualRegister(DstReg) && - TargetRegisterInfo::isVirtualRegister(SrcReg)) { + if (Register::isVirtualRegister(DstReg) && + Register::isVirtualRegister(SrcReg)) { // Map the following: // %170 = SXTW %166 // PeepholeMap[170] = %166 @@ -157,8 +157,8 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { MachineOperand &Src2 = MI.getOperand(2); if (Src1.getImm() != 0) continue; - unsigned DstReg = Dst.getReg(); - unsigned SrcReg = Src2.getReg(); + Register DstReg = Dst.getReg(); + Register SrcReg = Src2.getReg(); PeepholeMap[DstReg] = SrcReg; } @@ -174,8 +174,8 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { MachineOperand &Src2 = MI.getOperand(2); if (Src2.getImm() != 32) continue; - unsigned DstReg = Dst.getReg(); - unsigned SrcReg = Src1.getReg(); + Register DstReg = Dst.getReg(); + Register SrcReg = Src1.getReg(); PeepholeDoubleRegsMap[DstReg] = std::make_pair(*&SrcReg, Hexagon::isub_hi); } @@ -185,11 +185,11 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { assert(MI.getNumOperands() == 2); MachineOperand &Dst = MI.getOperand(0); MachineOperand &Src = MI.getOperand(1); - unsigned DstReg = Dst.getReg(); - unsigned SrcReg = Src.getReg(); + Register DstReg = Dst.getReg(); + Register SrcReg = Src.getReg(); // Just handle virtual registers. - if (TargetRegisterInfo::isVirtualRegister(DstReg) && - TargetRegisterInfo::isVirtualRegister(SrcReg)) { + if (Register::isVirtualRegister(DstReg) && + Register::isVirtualRegister(SrcReg)) { // Map the following: // %170 = NOT_xx %166 // PeepholeMap[170] = %166 @@ -208,10 +208,10 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { if (Src.getSubReg() != Hexagon::isub_lo) continue; - unsigned DstReg = Dst.getReg(); - unsigned SrcReg = Src.getReg(); - if (TargetRegisterInfo::isVirtualRegister(DstReg) && - TargetRegisterInfo::isVirtualRegister(SrcReg)) { + Register DstReg = Dst.getReg(); + Register SrcReg = Src.getReg(); + if (Register::isVirtualRegister(DstReg) && + Register::isVirtualRegister(SrcReg)) { // Try to find in the map. if (unsigned PeepholeSrc = PeepholeMap.lookup(SrcReg)) { // Change the 1st operand. @@ -237,12 +237,12 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { bool Done = false; if (QII->isPredicated(MI)) { MachineOperand &Op0 = MI.getOperand(0); - unsigned Reg0 = Op0.getReg(); + Register Reg0 = Op0.getReg(); const TargetRegisterClass *RC0 = MRI->getRegClass(Reg0); if (RC0->getID() == Hexagon::PredRegsRegClassID) { // Handle instructions that have a prediate register in op0 // (most cases of predicable instructions). - if (TargetRegisterInfo::isVirtualRegister(Reg0)) { + if (Register::isVirtualRegister(Reg0)) { // Try to find in the map. if (unsigned PeepholeSrc = PeepholeMap.lookup(Reg0)) { // Change the 1st operand and, flip the opcode. 
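// [editor's note] The unsigned -> Register rewrites in HexagonPeephole.cpp
// above follow an LLVM-wide migration: llvm::Register wraps the raw id and
// owns the virtual/physical distinction. Simplified mock of the idea (not the
// real class, which lives in llvm/CodeGen/Register.h):
#include <cassert>
struct MockReg {
  unsigned Id = 0;
  static constexpr unsigned VirtBit = 1u << 31;   // virtual regs use the MSB
  bool isVirtual() const { return Id & VirtBit; }
  bool isPhysical() const { return Id != 0 && !isVirtual(); }
  operator unsigned() const { return Id; }        // old unsigned code keeps working
};
int main() {
  MockReg R{MockReg::VirtBit | 5};
  assert(R.isVirtual() && !R.isPhysical());
}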
@@ -275,7 +275,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { break; } if (NewOp) { - unsigned PSrc = MI.getOperand(PR).getReg(); + Register PSrc = MI.getOperand(PR).getReg(); if (unsigned POrig = PeepholeMap.lookup(PSrc)) { BuildMI(*MBB, MI.getIterator(), MI.getDebugLoc(), QII->get(NewOp), MI.getOperand(0).getReg()) diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp index 4f5f750e5842..b7171fb14272 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -217,7 +217,7 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // If the offset is not valid, calculate the address in a temporary // register and use it with offset 0. auto &MRI = MF.getRegInfo(); - unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); const DebugLoc &DL = MI.getDebugLoc(); BuildMI(MB, II, DL, HII.get(Hexagon::A2_addi), TmpR) .addReg(BP) @@ -249,8 +249,8 @@ bool HexagonRegisterInfo::shouldCoalesce(MachineInstr *MI, if (!SmallSrc && !SmallDst) return true; - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); + Register DstReg = MI->getOperand(0).getReg(); + Register SrcReg = MI->getOperand(1).getReg(); const SlotIndexes &Indexes = *LIS.getSlotIndexes(); auto HasCall = [&Indexes] (const LiveInterval::Segment &S) { for (SlotIndex I = S.start.getBaseIndex(), E = S.end.getBaseIndex(); diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp index bd4254aea276..f9fb14c190ff 100644 --- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp +++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp @@ -76,18 +76,18 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) { unsigned Opc = MI.getOpcode(); if (Opc == Hexagon::CONST32) { - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); uint64_t ImmValue = MI.getOperand(1).getImm(); const DebugLoc &DL = MI.getDebugLoc(); BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), DestReg) .addImm(ImmValue); B.erase(&MI); } else if (Opc == Hexagon::CONST64) { - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); int64_t ImmValue = MI.getOperand(1).getImm(); const DebugLoc &DL = MI.getDebugLoc(); - unsigned DestLo = TRI->getSubReg(DestReg, Hexagon::isub_lo); - unsigned DestHi = TRI->getSubReg(DestReg, Hexagon::isub_hi); + Register DestLo = TRI->getSubReg(DestReg, Hexagon::isub_lo); + Register DestHi = TRI->getSubReg(DestReg, Hexagon::isub_hi); int32_t LowWord = (ImmValue & 0xFFFFFFFF); int32_t HighWord = (ImmValue >> 32) & 0xFFFFFFFF; diff --git a/lib/Target/Hexagon/HexagonSplitDouble.cpp b/lib/Target/Hexagon/HexagonSplitDouble.cpp index 013eede2d414..55f31c628854 100644 --- a/lib/Target/Hexagon/HexagonSplitDouble.cpp +++ b/lib/Target/Hexagon/HexagonSplitDouble.cpp @@ -210,8 +210,8 @@ bool HexagonSplitDoubleRegs::isFixedInstr(const MachineInstr *MI) const { for (auto &Op : MI->operands()) { if (!Op.isReg()) continue; - unsigned R = Op.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = Op.getReg(); + if (!Register::isVirtualRegister(R)) return true; } return false; @@ -224,14 +224,14 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) { unsigned NumRegs = MRI->getNumVirtRegs(); BitVector 
DoubleRegs(NumRegs); for (unsigned i = 0; i < NumRegs; ++i) { - unsigned R = TargetRegisterInfo::index2VirtReg(i); + unsigned R = Register::index2VirtReg(i); if (MRI->getRegClass(R) == DoubleRC) DoubleRegs.set(i); } BitVector FixedRegs(NumRegs); for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) { - unsigned R = TargetRegisterInfo::index2VirtReg(x); + unsigned R = Register::index2VirtReg(x); MachineInstr *DefI = MRI->getVRegDef(R); // In some cases a register may exist, but never be defined or used. // It should never appear anywhere, but mark it as "fixed", just to be @@ -244,7 +244,7 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) { for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) { if (FixedRegs[x]) continue; - unsigned R = TargetRegisterInfo::index2VirtReg(x); + unsigned R = Register::index2VirtReg(x); LLVM_DEBUG(dbgs() << printReg(R, TRI) << " ~~"); USet &Asc = AssocMap[R]; for (auto U = MRI->use_nodbg_begin(R), Z = MRI->use_nodbg_end(); @@ -258,14 +258,14 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) { // Skip non-registers or registers with subregisters. if (&MO == &Op || !MO.isReg() || MO.getSubReg()) continue; - unsigned T = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(T)) { + Register T = MO.getReg(); + if (!Register::isVirtualRegister(T)) { FixedRegs.set(x); continue; } if (MRI->getRegClass(T) != DoubleRC) continue; - unsigned u = TargetRegisterInfo::virtReg2Index(T); + unsigned u = Register::virtReg2Index(T); if (FixedRegs[u]) continue; LLVM_DEBUG(dbgs() << ' ' << printReg(T, TRI)); @@ -281,7 +281,7 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) { unsigned NextP = 1; USet Visited; for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) { - unsigned R = TargetRegisterInfo::index2VirtReg(x); + unsigned R = Register::index2VirtReg(x); if (Visited.count(R)) continue; // Create a new partition for R. @@ -372,8 +372,8 @@ int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const { case Hexagon::A2_andp: case Hexagon::A2_orp: case Hexagon::A2_xorp: { - unsigned Rs = MI->getOperand(1).getReg(); - unsigned Rt = MI->getOperand(2).getReg(); + Register Rs = MI->getOperand(1).getReg(); + Register Rt = MI->getOperand(2).getReg(); return profit(Rs) + profit(Rt); } @@ -400,7 +400,7 @@ int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const { } int32_t HexagonSplitDoubleRegs::profit(unsigned Reg) const { - assert(TargetRegisterInfo::isVirtualRegister(Reg)); + assert(Register::isVirtualRegister(Reg)); const MachineInstr *DefI = MRI->getVRegDef(Reg); switch (DefI->getOpcode()) { @@ -499,7 +499,7 @@ void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L, return; assert(Cond[1].isReg() && "Unexpected Cond vector from analyzeBranch"); // Expect a predicate register. - unsigned PR = Cond[1].getReg(); + Register PR = Cond[1].getReg(); assert(MRI->getRegClass(PR) == &Hexagon::PredRegsRegClass); // Get the registers on which the loop controlling compare instruction @@ -535,7 +535,7 @@ void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L, if (!MI.isPHI()) break; const MachineOperand &MD = MI.getOperand(0); - unsigned R = MD.getReg(); + Register R = MD.getReg(); if (MRI->getRegClass(R) == DoubleRC) DP.push_back(R); } @@ -551,7 +551,7 @@ void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L, // Get the output from the add. 
If it is one of the inputs to the // loop-controlling compare instruction, then R is likely an induc- // tion register. - unsigned T = UseI->getOperand(0).getReg(); + Register T = UseI->getOperand(0).getReg(); if (T == CmpR1 || T == CmpR2) return false; } @@ -603,9 +603,9 @@ void HexagonSplitDoubleRegs::createHalfInstr(unsigned Opc, MachineInstr *MI, continue; } // For register operands, set the subregister. - unsigned R = Op.getReg(); + Register R = Op.getReg(); unsigned SR = Op.getSubReg(); - bool isVirtReg = TargetRegisterInfo::isVirtualRegister(R); + bool isVirtReg = Register::isVirtualRegister(R); bool isKill = Op.isKill(); if (isVirtReg && MRI->getRegClass(R) == DoubleRC) { isKill = false; @@ -674,7 +674,7 @@ void HexagonSplitDoubleRegs::splitMemRef(MachineInstr *MI, : MI->getOperand(2).getImm(); MachineOperand &UpdOp = Load ? MI->getOperand(1) : MI->getOperand(0); const TargetRegisterClass *RC = MRI->getRegClass(UpdOp.getReg()); - unsigned NewR = MRI->createVirtualRegister(RC); + Register NewR = MRI->createVirtualRegister(RC); assert(!UpdOp.getSubReg() && "Def operand with subreg"); BuildMI(B, MI, DL, TII->get(Hexagon::A2_addi), NewR) .addReg(AdrOp.getReg(), RSA) @@ -789,8 +789,8 @@ void HexagonSplitDoubleRegs::splitShift(MachineInstr *MI, UUPairMap::const_iterator F = PairMap.find(Op0.getReg()); assert(F != PairMap.end()); const UUPair &P = F->second; - unsigned LoR = P.first; - unsigned HiR = P.second; + Register LoR = P.first; + Register HiR = P.second; unsigned Opc = MI->getOpcode(); bool Right = (Opc == S2_lsr_i_p || Opc == S2_asr_i_p); @@ -813,7 +813,7 @@ void HexagonSplitDoubleRegs::splitShift(MachineInstr *MI, .addReg(Op1.getReg(), RS, HiSR); } else if (S < 32) { const TargetRegisterClass *IntRC = &IntRegsRegClass; - unsigned TmpR = MRI->createVirtualRegister(IntRC); + Register TmpR = MRI->createVirtualRegister(IntRC); // Expansion: // Shift left: DR = shl R, #s // LoR = shl R.lo, #s @@ -953,12 +953,12 @@ void HexagonSplitDoubleRegs::splitAslOr(MachineInstr *MI, .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR) .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR) .addImm(S); - unsigned TmpR1 = MRI->createVirtualRegister(IntRC); + Register TmpR1 = MRI->createVirtualRegister(IntRC); BuildMI(B, MI, DL, TII->get(S2_extractu), TmpR1) .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR) .addImm(S) .addImm(32-S); - unsigned TmpR2 = MRI->createVirtualRegister(IntRC); + Register TmpR2 = MRI->createVirtualRegister(IntRC); BuildMI(B, MI, DL, TII->get(A2_or), TmpR2) .addReg(Op1.getReg(), RS1, HiSR) .addReg(TmpR1); @@ -1002,7 +1002,7 @@ bool HexagonSplitDoubleRegs::splitInstr(MachineInstr *MI, switch (Opc) { case TargetOpcode::PHI: case TargetOpcode::COPY: { - unsigned DstR = MI->getOperand(0).getReg(); + Register DstR = MI->getOperand(0).getReg(); if (MRI->getRegClass(DstR) == DoubleRC) { createHalfInstr(Opc, MI, PairMap, isub_lo); createHalfInstr(Opc, MI, PairMap, isub_hi); @@ -1079,7 +1079,7 @@ void HexagonSplitDoubleRegs::replaceSubregUses(MachineInstr *MI, for (auto &Op : MI->operands()) { if (!Op.isReg() || !Op.isUse() || !Op.getSubReg()) continue; - unsigned R = Op.getReg(); + Register R = Op.getReg(); UUPairMap::const_iterator F = PairMap.find(R); if (F == PairMap.end()) continue; @@ -1104,8 +1104,8 @@ void HexagonSplitDoubleRegs::collapseRegPairs(MachineInstr *MI, for (auto &Op : MI->operands()) { if (!Op.isReg() || !Op.isUse()) continue; - unsigned R = Op.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = Op.getReg(); + if (!Register::isVirtualRegister(R)) 
continue; if (MRI->getRegClass(R) != DoubleRC || Op.getSubReg()) continue; @@ -1113,7 +1113,7 @@ void HexagonSplitDoubleRegs::collapseRegPairs(MachineInstr *MI, if (F == PairMap.end()) continue; const UUPair &Pr = F->second; - unsigned NewDR = MRI->createVirtualRegister(DoubleRC); + Register NewDR = MRI->createVirtualRegister(DoubleRC); BuildMI(B, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), NewDR) .addReg(Pr.first) .addImm(Hexagon::isub_lo) @@ -1145,8 +1145,8 @@ bool HexagonSplitDoubleRegs::splitPartition(const USet &Part) { U != W; ++U) SplitIns.insert(U->getParent()); - unsigned LoR = MRI->createVirtualRegister(IntRC); - unsigned HiR = MRI->createVirtualRegister(IntRC); + Register LoR = MRI->createVirtualRegister(IntRC); + Register HiR = MRI->createVirtualRegister(IntRC); LLVM_DEBUG(dbgs() << "Created mapping: " << printReg(DR, TRI) << " -> " << printReg(HiR, TRI) << ':' << printReg(LoR, TRI) << '\n'); diff --git a/lib/Target/Hexagon/HexagonStoreWidening.cpp b/lib/Target/Hexagon/HexagonStoreWidening.cpp index b8b61517ff95..27fefa5f5e2b 100644 --- a/lib/Target/Hexagon/HexagonStoreWidening.cpp +++ b/lib/Target/Hexagon/HexagonStoreWidening.cpp @@ -441,7 +441,7 @@ bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG, // Create vreg = A2_tfrsi #Acc; mem[hw] = vreg const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi); const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI, *MF); - unsigned VReg = MF->getRegInfo().createVirtualRegister(RC); + Register VReg = MF->getRegInfo().createVirtualRegister(RC); MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg) .addImm(int(Acc)); NG.push_back(TfrI); diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp index 7ec63a642b0c..6c706fea096b 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -119,7 +119,7 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { FeatureBitset Features = getFeatureBits(); if (HexagonDisableDuplex) - setFeatureBits(Features.set(Hexagon::FeatureDuplex, false)); + setFeatureBits(Features.reset(Hexagon::FeatureDuplex)); setFeatureBits(Hexagon_MC::completeHVXFeatures(Features)); return *this; @@ -230,7 +230,7 @@ void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs *DAGInstrs) { else if (SchedRetvalOptimization) { const MachineInstr *MI = DAG->SUnits[su].getInstr(); if (MI->isCopy() && - TargetRegisterInfo::isPhysicalRegister(MI->getOperand(1).getReg())) { + Register::isPhysicalRegister(MI->getOperand(1).getReg())) { // %vregX = COPY %r0 VRegHoldingReg[MI->getOperand(0).getReg()] = MI->getOperand(1).getReg(); LastVRegUse.erase(MI->getOperand(1).getReg()); @@ -243,8 +243,7 @@ void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs *DAGInstrs) { VRegHoldingReg.count(MO.getReg())) { // <use of %vregX> LastVRegUse[VRegHoldingReg[MO.getReg()]] = &DAG->SUnits[su]; - } else if (MO.isDef() && - TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + } else if (MO.isDef() && Register::isPhysicalRegister(MO.getReg())) { for (MCRegAliasIterator AI(MO.getReg(), &TRI, true); AI.isValid(); ++AI) { if (LastVRegUse.count(*AI) && @@ -345,7 +344,7 @@ void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst, // If it's a REG_SEQUENCE/COPY, use its destination instruction to determine // the correct latency. 
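// [editor's note] In the HexagonSubtarget hunk above, Features.reset(F) and
// Features.set(F, false) clear the same bit; the patch just picks the clearer
// spelling. FeatureBitset behaves like std::bitset here:
#include <bitset>
#include <cassert>
int main() {
  std::bitset<128> a, b;
  a.set(90); b.set(90);
  a.set(90, false); // old spelling
  b.reset(90);      // new spelling
  assert(a == b && !a[90]);
}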
if ((DstInst->isRegSequence() || DstInst->isCopy()) && Dst->NumSuccs == 1) { - unsigned DReg = DstInst->getOperand(0).getReg(); + Register DReg = DstInst->getOperand(0).getReg(); MachineInstr *DDst = Dst->Succs[0].getSUnit()->getInstr(); unsigned UseIdx = -1; for (unsigned OpNum = 0; OpNum < DDst->getNumOperands(); OpNum++) { @@ -375,15 +374,15 @@ void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst, void HexagonSubtarget::getPostRAMutations( std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { - Mutations.push_back(llvm::make_unique<UsrOverflowMutation>()); - Mutations.push_back(llvm::make_unique<HVXMemLatencyMutation>()); - Mutations.push_back(llvm::make_unique<BankConflictMutation>()); + Mutations.push_back(std::make_unique<UsrOverflowMutation>()); + Mutations.push_back(std::make_unique<HVXMemLatencyMutation>()); + Mutations.push_back(std::make_unique<BankConflictMutation>()); } void HexagonSubtarget::getSMSMutations( std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { - Mutations.push_back(llvm::make_unique<UsrOverflowMutation>()); - Mutations.push_back(llvm::make_unique<HVXMemLatencyMutation>()); + Mutations.push_back(std::make_unique<UsrOverflowMutation>()); + Mutations.push_back(std::make_unique<HVXMemLatencyMutation>()); } // Pin the vtable to this file. diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h index 007423ef1902..31157a0065d9 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.h +++ b/lib/Target/Hexagon/HexagonSubtarget.h @@ -228,7 +228,7 @@ public: } bool isHVXVectorType(MVT VecTy, bool IncludeBool = false) const { - if (!VecTy.isVector() || !useHVXOps()) + if (!VecTy.isVector() || !useHVXOps() || VecTy.isScalableVector()) return false; MVT ElemTy = VecTy.getVectorElementType(); if (!IncludeBool && ElemTy == MVT::i1) diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index 80b8480448fe..d709a82be660 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -111,10 +111,10 @@ int HexagonTargetMachineModule = 0; static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) { ScheduleDAGMILive *DAG = - new VLIWMachineScheduler(C, make_unique<ConvergingVLIWScheduler>()); - DAG->addMutation(make_unique<HexagonSubtarget::UsrOverflowMutation>()); - DAG->addMutation(make_unique<HexagonSubtarget::HVXMemLatencyMutation>()); - DAG->addMutation(make_unique<HexagonSubtarget::CallMutation>()); + new VLIWMachineScheduler(C, std::make_unique<ConvergingVLIWScheduler>()); + DAG->addMutation(std::make_unique<HexagonSubtarget::UsrOverflowMutation>()); + DAG->addMutation(std::make_unique<HexagonSubtarget::HVXMemLatencyMutation>()); + DAG->addMutation(std::make_unique<HexagonSubtarget::CallMutation>()); DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI)); return DAG; } @@ -218,7 +218,7 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, TT, CPU, FS, Options, getEffectiveRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Small), (HexagonNoOpt ? 
CodeGenOpt::None : OL)), - TLOF(make_unique<HexagonTargetObjectFile>()) { + TLOF(std::make_unique<HexagonTargetObjectFile>()) { initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry()); initAsmInfo(); } @@ -244,7 +244,7 @@ HexagonTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. resetTargetOptions(F); - I = llvm::make_unique<HexagonSubtarget>(TargetTriple, CPU, FS, *this); + I = std::make_unique<HexagonSubtarget>(TargetTriple, CPU, FS, *this); } return I.get(); } diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index 38062e8e922c..ddbc5543348d 100644 --- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -45,6 +45,8 @@ bool HexagonTTIImpl::useHVX() const { bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const { assert(VecTy->isVectorTy()); + if (cast<VectorType>(VecTy)->isScalable()) + return false; // Avoid types like <2 x i32*>. if (!cast<VectorType>(VecTy)->getElementType()->isIntegerTy()) return false; diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 27e8fc019007..12ede503af83 100644 --- a/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -68,8 +68,8 @@ public: bool shouldFavorPostInc() const; // L1 cache prefetch. - unsigned getPrefetchDistance() const; - unsigned getCacheLineSize() const; + unsigned getPrefetchDistance() const override; + unsigned getCacheLineSize() const override; /// @} diff --git a/lib/Target/Hexagon/HexagonVExtract.cpp b/lib/Target/Hexagon/HexagonVExtract.cpp index a9692f42e468..0c0266a6839a 100644 --- a/lib/Target/Hexagon/HexagonVExtract.cpp +++ b/lib/Target/Hexagon/HexagonVExtract.cpp @@ -67,9 +67,9 @@ unsigned HexagonVExtract::genElemLoad(MachineInstr *ExtI, unsigned BaseR, MachineRegisterInfo &MRI) { MachineBasicBlock &ExtB = *ExtI->getParent(); DebugLoc DL = ExtI->getDebugLoc(); - unsigned ElemR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register ElemR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); - unsigned ExtIdxR = ExtI->getOperand(2).getReg(); + Register ExtIdxR = ExtI->getOperand(2).getReg(); unsigned ExtIdxS = ExtI->getOperand(2).getSubReg(); // Simplified check for a compile-time constant value of ExtIdxR. @@ -86,7 +86,7 @@ unsigned HexagonVExtract::genElemLoad(MachineInstr *ExtI, unsigned BaseR, } } - unsigned IdxR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register IdxR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); BuildMI(ExtB, ExtI, DL, HII->get(Hexagon::A2_andir), IdxR) .add(ExtI->getOperand(2)) .addImm(-4); @@ -111,7 +111,7 @@ bool HexagonVExtract::runOnMachineFunction(MachineFunction &MF) { unsigned Opc = MI.getOpcode(); if (Opc != Hexagon::V6_extractw) continue; - unsigned VecR = MI.getOperand(1).getReg(); + Register VecR = MI.getOperand(1).getReg(); VExtractMap[VecR].push_back(&MI); } } @@ -144,13 +144,13 @@ bool HexagonVExtract::runOnMachineFunction(MachineFunction &MF) { MachineBasicBlock &ExtB = *ExtI->getParent(); DebugLoc DL = ExtI->getDebugLoc(); - unsigned BaseR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register BaseR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); BuildMI(ExtB, ExtI, DL, HII->get(Hexagon::PS_fi), BaseR) .addFrameIndex(FI) .addImm(SR == 0 ? 
0 : VecSize/2); unsigned ElemR = genElemLoad(ExtI, BaseR, MRI); - unsigned ExtR = ExtI->getOperand(0).getReg(); + Register ExtR = ExtI->getOperand(0).getReg(); MRI.replaceRegWith(ExtR, ElemR); ExtB.erase(ExtI); Changed = true; diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index 3619e4c239d7..fab5edefb553 100644 --- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" @@ -57,9 +58,9 @@ static cl::opt<bool> DisablePacketizer("disable-packetizer", cl::Hidden, cl::ZeroOrMore, cl::init(false), cl::desc("Disable Hexagon packetizer pass")); -cl::opt<bool> Slot1Store("slot1-store-slot0-load", cl::Hidden, - cl::ZeroOrMore, cl::init(true), - cl::desc("Allow slot1 store and slot0 load")); +static cl::opt<bool> Slot1Store("slot1-store-slot0-load", cl::Hidden, + cl::ZeroOrMore, cl::init(true), + cl::desc("Allow slot1 store and slot0 load")); static cl::opt<bool> PacketizeVolatiles("hexagon-packetize-volatiles", cl::ZeroOrMore, cl::Hidden, cl::init(true), @@ -129,16 +130,16 @@ INITIALIZE_PASS_END(HexagonPacketizer, "hexagon-packetizer", "Hexagon Packetizer", false, false) HexagonPacketizerList::HexagonPacketizerList(MachineFunction &MF, - MachineLoopInfo &MLI, AliasAnalysis *AA, + MachineLoopInfo &MLI, AAResults *AA, const MachineBranchProbabilityInfo *MBPI, bool Minimal) : VLIWPacketizerList(MF, MLI, AA), MBPI(MBPI), MLI(&MLI), Minimal(Minimal) { HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); - addMutation(llvm::make_unique<HexagonSubtarget::UsrOverflowMutation>()); - addMutation(llvm::make_unique<HexagonSubtarget::HVXMemLatencyMutation>()); - addMutation(llvm::make_unique<HexagonSubtarget::BankConflictMutation>()); + addMutation(std::make_unique<HexagonSubtarget::UsrOverflowMutation>()); + addMutation(std::make_unique<HexagonSubtarget::HVXMemLatencyMutation>()); + addMutation(std::make_unique<HexagonSubtarget::BankConflictMutation>()); } // Check if FirstI modifies a register that SecondI reads. @@ -148,7 +149,7 @@ static bool hasWriteToReadDep(const MachineInstr &FirstI, for (auto &MO : FirstI.operands()) { if (!MO.isReg() || !MO.isDef()) continue; - unsigned R = MO.getReg(); + Register R = MO.getReg(); if (SecondI.readsRegister(R, TRI)) return true; } @@ -422,7 +423,7 @@ bool HexagonPacketizerList::canPromoteToDotCur(const MachineInstr &MI, dbgs() << "Checking CUR against "; MJ.dump(); }); - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); bool FoundMatch = false; for (auto &MO : MJ.operands()) if (MO.isReg() && MO.getReg() == DestReg) @@ -515,7 +516,7 @@ bool HexagonPacketizerList::updateOffset(SUnit *SUI, SUnit *SUJ) { unsigned BPJ, OPJ; if (!HII->getBaseAndOffsetPosition(MJ, BPJ, OPJ)) return false; - unsigned Reg = MI.getOperand(BPI).getReg(); + Register Reg = MI.getOperand(BPI).getReg(); if (Reg != MJ.getOperand(BPJ).getReg()) return false; // Make sure that the dependences do not restrict adding MI to the packet. 
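// [editor's note] Slot1Store above (and HexagonVolatileMemcpy earlier) are
// made static so the flags get internal linkage; the option string alone is
// what registers them. Minimal pattern, assuming LLVM's CommandLine.h as used
// in the patch:
#include "llvm/Support/CommandLine.h"

static llvm::cl::opt<bool>
    DemoFlag("demo-flag", llvm::cl::Hidden, llvm::cl::init(true),
             llvm::cl::desc("Example of a file-local option"));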
@@ -788,7 +789,7 @@ bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr &MI, return false; if (!MO.isReg() || !MO.isDef() || !MO.isImplicit()) continue; - unsigned R = MO.getReg(); + Register R = MO.getReg(); if (R == DepReg || HRI->isSuperRegister(DepReg, R)) return false; } @@ -1208,7 +1209,7 @@ bool HexagonPacketizerList::hasDeadDependence(const MachineInstr &I, for (auto &MO : J.operands()) { if (!MO.isReg() || !MO.isDef() || !MO.isDead()) continue; - unsigned R = MO.getReg(); + Register R = MO.getReg(); if (R != Hexagon::USR_OVF && DeadDefs[R]) return true; } @@ -1585,7 +1586,7 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) { // subset of the volatile register set. for (const MachineOperand &Op : I.operands()) { if (Op.isReg() && Op.isDef()) { - unsigned R = Op.getReg(); + Register R = Op.getReg(); if (!J.readsRegister(R, HRI) && !J.modifiesRegister(R, HRI)) continue; } else if (!Op.isRegMask()) { @@ -1763,6 +1764,16 @@ HexagonPacketizerList::addToPacket(MachineInstr &MI) { void HexagonPacketizerList::endPacket(MachineBasicBlock *MBB, MachineBasicBlock::iterator EndMI) { // Replace VLIWPacketizerList::endPacket(MBB, EndMI). + LLVM_DEBUG({ + if (!CurrentPacketMIs.empty()) { + dbgs() << "Finalizing packet:\n"; + unsigned Idx = 0; + for (MachineInstr *MI : CurrentPacketMIs) { + unsigned R = ResourceTracker->getUsedResources(Idx++); + dbgs() << " * [res:0x" << utohexstr(R) << "] " << *MI; + } + } + }); bool memShufDisabled = getmemShufDisabled(); if (memShufDisabled && !foundLSInPacket()) { diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/lib/Target/Hexagon/HexagonVLIWPacketizer.h index daa86b6f5393..943b9ac7ecc4 100644 --- a/lib/Target/Hexagon/HexagonVLIWPacketizer.h +++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.h @@ -69,8 +69,7 @@ private: public: HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI, - AliasAnalysis *AA, - const MachineBranchProbabilityInfo *MBPI, + AAResults *AA, const MachineBranchProbabilityInfo *MBPI, bool Minimal); // initPacketizerState - initialize some internal flags. diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 7c0770926abe..75cb398d4097 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -201,9 +201,7 @@ public: bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target) override { - MCFixupKind Kind = Fixup.getKind(); - - switch((unsigned)Kind) { + switch(Fixup.getTargetKind()) { default: llvm_unreachable("Unknown Fixup Kind!"); @@ -583,7 +581,7 @@ public: return false; // If we cannot resolve the fixup value, it requires relaxation. 
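// [editor's note] The new endPacket dump above prints per-instruction resource
// masks with utohexstr, hence the added StringExtras.h include. Standalone use
// of that helper (it is header-only in LLVM):
#include "llvm/ADT/StringExtras.h"
#include <cassert>
int main() {
  assert(llvm::utohexstr(0x1f) == "1F"); // uppercase hex, no 0x prefix
}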
if (!Resolved) { - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { case fixup_Hexagon_B22_PCREL: // GetFixupCount assumes B22 won't relax LLVM_FALLTHROUGH; diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp index f678bf49322e..cdbeae38b3a1 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp @@ -44,7 +44,7 @@ unsigned HexagonELFObjectWriter::getRelocType(MCContext &Ctx, MCFixup const &Fixup, bool IsPCRel) const { MCSymbolRefExpr::VariantKind Variant = Target.getAccessVariant(); - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: report_fatal_error("Unrecognized relocation type"); break; @@ -299,5 +299,5 @@ unsigned HexagonELFObjectWriter::getRelocType(MCContext &Ctx, std::unique_ptr<MCObjectTargetWriter> llvm::createHexagonELFObjectWriter(uint8_t OSABI, StringRef CPU) { - return llvm::make_unique<HexagonELFObjectWriter>(OSABI, CPU); + return std::make_unique<HexagonELFObjectWriter>(OSABI, CPU); } diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp index fcd3758600c1..8b262bd0248e 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -726,9 +726,6 @@ void HexagonMCChecker::reportNote(SMLoc Loc, llvm::Twine const &Msg) { } void HexagonMCChecker::reportWarning(Twine const &Msg) { - if (ReportErrors) { - auto SM = Context.getSourceManager(); - if (SM) - SM->PrintMessage(MCB.getLoc(), SourceMgr::DK_Warning, Msg); - } + if (ReportErrors) + Context.reportWarning(MCB.getLoc(), Msg); } diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp index f2432883af6f..a799f7f7c0b9 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -116,8 +116,8 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol, } // Update the maximum alignment of the section if necessary. - if (ByteAlignment > Section.getAlignment()) - Section.setAlignment(ByteAlignment); + if (Align(ByteAlignment) > Section.getAlignment()) + Section.setAlignment(Align(ByteAlignment)); SwitchSection(P.first, P.second); } else { diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 9c50b25156c3..870ab9e94a63 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -72,7 +72,6 @@ cl::opt<bool> MV65("mv65", cl::Hidden, cl::desc("Build for Hexagon V65"), cl::init(false)); cl::opt<bool> MV66("mv66", cl::Hidden, cl::desc("Build for Hexagon V66"), cl::init(false)); -} // namespace cl::opt<Hexagon::ArchEnum> EnableHVX("mhvx", @@ -86,6 +85,7 @@ cl::opt<Hexagon::ArchEnum> clEnumValN(Hexagon::ArchEnum::Generic, "", "")), // Sentinel for flag not present. 
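// [editor's note] The HexagonMCELFStreamer hunk above routes the alignment
// comparison through LLVM's Align type rather than raw byte counts, which
// enforces the power-of-two invariant at construction time. Sketch:
#include "llvm/Support/Alignment.h"
#include <cassert>
int main() {
  llvm::Align A(16), B(8); // values must be powers of two
  assert(A.value() > B.value() && A == llvm::Align(16));
}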
cl::init(Hexagon::ArchEnum::NoArch), cl::ValueOptional); +} // namespace static cl::opt<bool> DisableHVX("mno-hvx", cl::Hidden, @@ -264,14 +264,12 @@ createHexagonObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { } static void LLVM_ATTRIBUTE_UNUSED clearFeature(MCSubtargetInfo* STI, uint64_t F) { - uint64_t FB = STI->getFeatureBits().to_ullong(); - if (FB & (1ULL << F)) + if (STI->getFeatureBits()[F]) STI->ToggleFeature(F); } static bool LLVM_ATTRIBUTE_UNUSED checkFeature(MCSubtargetInfo* STI, uint64_t F) { - uint64_t FB = STI->getFeatureBits().to_ullong(); - return (FB & (1ULL << F)) != 0; + return STI->getFeatureBits()[F]; } namespace { @@ -398,7 +396,7 @@ MCSubtargetInfo *Hexagon_MC::createHexagonMCSubtargetInfo(const Triple &TT, MCSubtargetInfo *X = createHexagonMCSubtargetInfoImpl(TT, CPUName, ArchFS); if (HexagonDisableDuplex) { llvm::FeatureBitset Features = X->getFeatureBits(); - X->setFeatureBits(Features.set(Hexagon::FeatureDuplex, false)); + X->setFeatureBits(Features.reset(Hexagon::FeatureDuplex)); } X->setFeatureBits(completeHVXFeatures(X->getFeatureBits())); diff --git a/lib/Target/Hexagon/RDFCopy.cpp b/lib/Target/Hexagon/RDFCopy.cpp index 7702024f87bd..a9d39fd4b2dc 100644 --- a/lib/Target/Hexagon/RDFCopy.cpp +++ b/lib/Target/Hexagon/RDFCopy.cpp @@ -45,8 +45,8 @@ bool CopyPropagation::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) { const MachineOperand &Src = MI->getOperand(1); RegisterRef DstR = DFG.makeRegRef(Dst.getReg(), Dst.getSubReg()); RegisterRef SrcR = DFG.makeRegRef(Src.getReg(), Src.getSubReg()); - assert(TargetRegisterInfo::isPhysicalRegister(DstR.Reg)); - assert(TargetRegisterInfo::isPhysicalRegister(SrcR.Reg)); + assert(Register::isPhysicalRegister(DstR.Reg)); + assert(Register::isPhysicalRegister(SrcR.Reg)); const TargetRegisterInfo &TRI = DFG.getTRI(); if (TRI.getMinimalPhysRegClass(DstR.Reg) != TRI.getMinimalPhysRegClass(SrcR.Reg)) diff --git a/lib/Target/Hexagon/RDFDeadCode.cpp b/lib/Target/Hexagon/RDFDeadCode.cpp index 52178931aa6d..af86c7b1956b 100644 --- a/lib/Target/Hexagon/RDFDeadCode.cpp +++ b/lib/Target/Hexagon/RDFDeadCode.cpp @@ -16,6 +16,7 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" #include <queue> diff --git a/lib/Target/Hexagon/RDFGraph.cpp b/lib/Target/Hexagon/RDFGraph.cpp index 9d8f706b8a0f..0cb35dc98819 100644 --- a/lib/Target/Hexagon/RDFGraph.cpp +++ b/lib/Target/Hexagon/RDFGraph.cpp @@ -633,7 +633,7 @@ bool TargetOperandInfo::isFixedReg(const MachineInstr &In, unsigned OpNum) // uses or defs, and those lists do not allow sub-registers. if (Op.getSubReg() != 0) return false; - RegisterId Reg = Op.getReg(); + Register Reg = Op.getReg(); const MCPhysReg *ImpR = Op.isDef() ? 
D.getImplicitDefs() : D.getImplicitUses(); if (!ImpR) @@ -963,7 +963,7 @@ void DataFlowGraph::build(unsigned Options) { RegisterRef DataFlowGraph::makeRegRef(unsigned Reg, unsigned Sub) const { assert(PhysicalRegisterInfo::isRegMaskId(Reg) || - TargetRegisterInfo::isPhysicalRegister(Reg)); + Register::isPhysicalRegister(Reg)); assert(Reg != 0); if (Sub != 0) Reg = TRI.getSubReg(Reg, Sub); @@ -1291,8 +1291,8 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) { MachineOperand &Op = In.getOperand(OpN); if (!Op.isReg() || !Op.isDef() || Op.isImplicit()) continue; - unsigned R = Op.getReg(); - if (!R || !TargetRegisterInfo::isPhysicalRegister(R)) + Register R = Op.getReg(); + if (!R || !Register::isPhysicalRegister(R)) continue; uint16_t Flags = NodeAttrs::None; if (TOI.isPreserving(In, OpN)) { @@ -1336,8 +1336,8 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) { MachineOperand &Op = In.getOperand(OpN); if (!Op.isReg() || !Op.isDef() || !Op.isImplicit()) continue; - unsigned R = Op.getReg(); - if (!R || !TargetRegisterInfo::isPhysicalRegister(R) || DoneDefs.test(R)) + Register R = Op.getReg(); + if (!R || !Register::isPhysicalRegister(R) || DoneDefs.test(R)) continue; RegisterRef RR = makeRegRef(Op); uint16_t Flags = NodeAttrs::None; @@ -1365,8 +1365,8 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) { MachineOperand &Op = In.getOperand(OpN); if (!Op.isReg() || !Op.isUse()) continue; - unsigned R = Op.getReg(); - if (!R || !TargetRegisterInfo::isPhysicalRegister(R)) + Register R = Op.getReg(); + if (!R || !Register::isPhysicalRegister(R)) continue; uint16_t Flags = NodeAttrs::None; if (Op.isUndef()) diff --git a/lib/Target/Hexagon/RDFLiveness.cpp b/lib/Target/Hexagon/RDFLiveness.cpp index 9cd304aa10bc..7d7b89462ff9 100644 --- a/lib/Target/Hexagon/RDFLiveness.cpp +++ b/lib/Target/Hexagon/RDFLiveness.cpp @@ -889,8 +889,8 @@ void Liveness::resetKills(MachineBasicBlock *B) { // implicit defs. if (!Op.isReg() || !Op.isDef() || Op.isImplicit()) continue; - unsigned R = Op.getReg(); - if (!TargetRegisterInfo::isPhysicalRegister(R)) + Register R = Op.getReg(); + if (!Register::isPhysicalRegister(R)) continue; for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR) Live.reset(*SR); @@ -898,8 +898,8 @@ void Liveness::resetKills(MachineBasicBlock *B) { for (auto &Op : MI->operands()) { if (!Op.isReg() || !Op.isUse() || Op.isUndef()) continue; - unsigned R = Op.getReg(); - if (!TargetRegisterInfo::isPhysicalRegister(R)) + Register R = Op.getReg(); + if (!Register::isPhysicalRegister(R)) continue; bool IsLive = false; for (MCRegAliasIterator AR(R, &TRI, true); AR.isValid(); ++AR) { diff --git a/lib/Target/Hexagon/RDFRegisters.cpp b/lib/Target/Hexagon/RDFRegisters.cpp index 6e0f33695f0e..b5675784e34b 100644 --- a/lib/Target/Hexagon/RDFRegisters.cpp +++ b/lib/Target/Hexagon/RDFRegisters.cpp @@ -101,7 +101,7 @@ RegisterRef PhysicalRegisterInfo::normalize(RegisterRef RR) const { std::set<RegisterId> PhysicalRegisterInfo::getAliasSet(RegisterId Reg) const { // Do not include RR in the alias set. 
std::set<RegisterId> AS; - assert(isRegMaskId(Reg) || TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(isRegMaskId(Reg) || Register::isPhysicalRegister(Reg)); if (isRegMaskId(Reg)) { // XXX SLOW const uint32_t *MB = getRegMaskBits(Reg); @@ -129,8 +129,8 @@ std::set<RegisterId> PhysicalRegisterInfo::getAliasSet(RegisterId Reg) const { } bool PhysicalRegisterInfo::aliasRR(RegisterRef RA, RegisterRef RB) const { - assert(TargetRegisterInfo::isPhysicalRegister(RA.Reg)); - assert(TargetRegisterInfo::isPhysicalRegister(RB.Reg)); + assert(Register::isPhysicalRegister(RA.Reg)); + assert(Register::isPhysicalRegister(RB.Reg)); MCRegUnitMaskIterator UMA(RA.Reg, &TRI); MCRegUnitMaskIterator UMB(RB.Reg, &TRI); @@ -160,7 +160,7 @@ bool PhysicalRegisterInfo::aliasRR(RegisterRef RA, RegisterRef RB) const { } bool PhysicalRegisterInfo::aliasRM(RegisterRef RR, RegisterRef RM) const { - assert(TargetRegisterInfo::isPhysicalRegister(RR.Reg) && isRegMaskId(RM.Reg)); + assert(Register::isPhysicalRegister(RR.Reg) && isRegMaskId(RM.Reg)); const uint32_t *MB = getRegMaskBits(RM.Reg); bool Preserved = MB[RR.Reg/32] & (1u << (RR.Reg%32)); // If the lane mask information is "full", e.g. when the given lane mask diff --git a/lib/Target/Hexagon/RDFRegisters.h b/lib/Target/Hexagon/RDFRegisters.h index 646233bacda5..4afaf80e4659 100644 --- a/lib/Target/Hexagon/RDFRegisters.h +++ b/lib/Target/Hexagon/RDFRegisters.h @@ -99,15 +99,15 @@ namespace rdf { const MachineFunction &mf); static bool isRegMaskId(RegisterId R) { - return TargetRegisterInfo::isStackSlot(R); + return Register::isStackSlot(R); } RegisterId getRegMaskId(const uint32_t *RM) const { - return TargetRegisterInfo::index2StackSlot(RegMasks.find(RM)); + return Register::index2StackSlot(RegMasks.find(RM)); } const uint32_t *getRegMaskBits(RegisterId R) const { - return RegMasks.get(TargetRegisterInfo::stackSlot2Index(R)); + return RegMasks.get(Register::stackSlot2Index(R)); } RegisterRef normalize(RegisterRef RR) const; @@ -125,7 +125,7 @@ namespace rdf { } const BitVector &getMaskUnits(RegisterId MaskId) const { - return MaskInfos[TargetRegisterInfo::stackSlot2Index(MaskId)].Units; + return MaskInfos[Register::stackSlot2Index(MaskId)].Units; } RegisterRef mapTo(RegisterRef RR, unsigned R) const; diff --git a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp index 9af8a0b35b2f..ec82e3a41f2a 100644 --- a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp +++ b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp @@ -469,13 +469,14 @@ public: else if (isa<LanaiMCExpr>(getImm())) { #ifndef NDEBUG const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(getImm()); - assert(SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO); + assert(SymbolRefExpr && + SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO); #endif Inst.addOperand(MCOperand::createExpr(getImm())); } else if (isa<MCBinaryExpr>(getImm())) { #ifndef NDEBUG const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm()); - assert(isa<LanaiMCExpr>(BinaryExpr->getLHS()) && + assert(BinaryExpr && isa<LanaiMCExpr>(BinaryExpr->getLHS()) && cast<LanaiMCExpr>(BinaryExpr->getLHS())->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO); #endif @@ -499,13 +500,14 @@ public: else if (isa<LanaiMCExpr>(getImm())) { #ifndef NDEBUG const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(getImm()); - assert(SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_HI); + assert(SymbolRefExpr && + SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_HI); #endif 
Inst.addOperand(MCOperand::createExpr(getImm())); } else if (isa<MCBinaryExpr>(getImm())) { #ifndef NDEBUG const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm()); - assert(isa<LanaiMCExpr>(BinaryExpr->getLHS()) && + assert(BinaryExpr && isa<LanaiMCExpr>(BinaryExpr->getLHS()) && cast<LanaiMCExpr>(BinaryExpr->getLHS())->getKind() == LanaiMCExpr::VK_Lanai_ABS_HI); #endif @@ -544,10 +546,9 @@ public: } else if (isa<MCBinaryExpr>(getImm())) { #ifndef NDEBUG const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm()); - const LanaiMCExpr *SymbolRefExpr = - dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()); - assert(SymbolRefExpr && - SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_None); + assert(BinaryExpr && isa<LanaiMCExpr>(BinaryExpr->getLHS()) && + cast<LanaiMCExpr>(BinaryExpr->getLHS())->getKind() == + LanaiMCExpr::VK_Lanai_None); #endif Inst.addOperand(MCOperand::createExpr(getImm())); } else @@ -580,7 +581,7 @@ public: } static std::unique_ptr<LanaiOperand> CreateToken(StringRef Str, SMLoc Start) { - auto Op = make_unique<LanaiOperand>(TOKEN); + auto Op = std::make_unique<LanaiOperand>(TOKEN); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->StartLoc = Start; @@ -590,7 +591,7 @@ public: static std::unique_ptr<LanaiOperand> createReg(unsigned RegNum, SMLoc Start, SMLoc End) { - auto Op = make_unique<LanaiOperand>(REGISTER); + auto Op = std::make_unique<LanaiOperand>(REGISTER); Op->Reg.RegNum = RegNum; Op->StartLoc = Start; Op->EndLoc = End; @@ -599,7 +600,7 @@ public: static std::unique_ptr<LanaiOperand> createImm(const MCExpr *Value, SMLoc Start, SMLoc End) { - auto Op = make_unique<LanaiOperand>(IMMEDIATE); + auto Op = std::make_unique<LanaiOperand>(IMMEDIATE); Op->Imm.Value = Value; Op->StartLoc = Start; Op->EndLoc = End; diff --git a/lib/Target/Lanai/LanaiAsmPrinter.cpp b/lib/Target/Lanai/LanaiAsmPrinter.cpp index 64d963475e1a..12a3202446a8 100644 --- a/lib/Target/Lanai/LanaiAsmPrinter.cpp +++ b/lib/Target/Lanai/LanaiAsmPrinter.cpp @@ -133,7 +133,7 @@ bool LanaiAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const MachineOperand &MO = MI->getOperand(RegOp); if (!MO.isReg()) return true; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); O << LanaiInstPrinter::getRegisterName(Reg); return false; } diff --git a/lib/Target/Lanai/LanaiDelaySlotFiller.cpp b/lib/Target/Lanai/LanaiDelaySlotFiller.cpp index 09c63dca23e2..b9e577d201f9 100644 --- a/lib/Target/Lanai/LanaiDelaySlotFiller.cpp +++ b/lib/Target/Lanai/LanaiDelaySlotFiller.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// Simple pass to fills delay slots with useful instructions. +// Simple pass to fill delay slots with useful instructions. 
// //===----------------------------------------------------------------------===// diff --git a/lib/Target/Lanai/LanaiFrameLowering.cpp b/lib/Target/Lanai/LanaiFrameLowering.cpp index 142c09c504cc..eddc2b8e61f7 100644 --- a/lib/Target/Lanai/LanaiFrameLowering.cpp +++ b/lib/Target/Lanai/LanaiFrameLowering.cpp @@ -72,8 +72,8 @@ void LanaiFrameLowering::replaceAdjDynAllocPseudo(MachineFunction &MF) const { MachineInstr &MI = *MBBI++; if (MI.getOpcode() == Lanai::ADJDYNALLOC) { DebugLoc DL = MI.getDebugLoc(); - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); BuildMI(*MBB, MI, DL, LII.get(Lanai::ADD_I_LO), Dst) .addReg(Src) diff --git a/lib/Target/Lanai/LanaiFrameLowering.h b/lib/Target/Lanai/LanaiFrameLowering.h index 5fe4535543ec..380d63df7301 100644 --- a/lib/Target/Lanai/LanaiFrameLowering.h +++ b/lib/Target/Lanai/LanaiFrameLowering.h @@ -31,7 +31,7 @@ protected: public: explicit LanaiFrameLowering(const LanaiSubtarget &Subtarget) : TargetFrameLowering(StackGrowsDown, - /*StackAlignment=*/8, + /*StackAlignment=*/Align(8), /*LocalAreaOffset=*/0), STI(Subtarget) {} diff --git a/lib/Target/Lanai/LanaiISelLowering.cpp b/lib/Target/Lanai/LanaiISelLowering.cpp index 1ed078bb433f..43933d062a7e 100644 --- a/lib/Target/Lanai/LanaiISelLowering.cpp +++ b/lib/Target/Lanai/LanaiISelLowering.cpp @@ -144,9 +144,9 @@ LanaiTargetLowering::LanaiTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::XOR); - // Function alignments (log2) - setMinFunctionAlignment(2); - setPrefFunctionAlignment(2); + // Function alignments + setMinFunctionAlignment(Align(4)); + setPrefFunctionAlignment(Align(4)); setJumpIsExpensive(true); @@ -212,10 +212,11 @@ SDValue LanaiTargetLowering::LowerOperation(SDValue Op, // Lanai Inline Assembly Support //===----------------------------------------------------------------------===// -unsigned LanaiTargetLowering::getRegisterByName(const char *RegName, EVT /*VT*/, - SelectionDAG & /*DAG*/) const { +Register LanaiTargetLowering::getRegisterByName( + const char *RegName, EVT /*VT*/, + const MachineFunction & /*MF*/) const { // Only unallocatable registers should be matched here. 
- unsigned Reg = StringSwitch<unsigned>(RegName) + Register Reg = StringSwitch<unsigned>(RegName) .Case("pc", Lanai::PC) .Case("sp", Lanai::SP) .Case("fp", Lanai::FP) @@ -459,7 +460,7 @@ SDValue LanaiTargetLowering::LowerCCCArguments( EVT RegVT = VA.getLocVT(); switch (RegVT.getSimpleVT().SimpleTy) { case MVT::i32: { - unsigned VReg = RegInfo.createVirtualRegister(&Lanai::GPRRegClass); + Register VReg = RegInfo.createVirtualRegister(&Lanai::GPRRegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, RegVT); diff --git a/lib/Target/Lanai/LanaiISelLowering.h b/lib/Target/Lanai/LanaiISelLowering.h index e7b5755e9041..4c35a2c6fb8e 100644 --- a/lib/Target/Lanai/LanaiISelLowering.h +++ b/lib/Target/Lanai/LanaiISelLowering.h @@ -90,8 +90,8 @@ public: SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; - unsigned getRegisterByName(const char *RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char *RegName, EVT VT, + const MachineFunction &MF) const override; std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; diff --git a/lib/Target/Lanai/LanaiInstrInfo.cpp b/lib/Target/Lanai/LanaiInstrInfo.cpp index 700a86069102..b950fd0424ef 100644 --- a/lib/Target/Lanai/LanaiInstrInfo.cpp +++ b/lib/Target/Lanai/LanaiInstrInfo.cpp @@ -86,8 +86,7 @@ void LanaiInstrInfo::loadRegFromStackSlot( } bool LanaiInstrInfo::areMemAccessesTriviallyDisjoint( - const MachineInstr &MIa, const MachineInstr &MIb, - AliasAnalysis * /*AA*/) const { + const MachineInstr &MIa, const MachineInstr &MIb) const { assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); @@ -457,7 +456,7 @@ bool LanaiInstrInfo::analyzeSelect(const MachineInstr &MI, // return the defining instruction. static MachineInstr *canFoldIntoSelect(unsigned Reg, const MachineRegisterInfo &MRI) { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return nullptr; if (!MRI.hasOneNonDBGUse(Reg)) return nullptr; @@ -479,7 +478,7 @@ static MachineInstr *canFoldIntoSelect(unsigned Reg, // MI can't have any tied operands, that would conflict with predication. if (MO.isTied()) return nullptr; - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + if (Register::isPhysicalRegister(MO.getReg())) return nullptr; if (MO.isDef() && !MO.isDead()) return nullptr; @@ -505,7 +504,7 @@ LanaiInstrInfo::optimizeSelect(MachineInstr &MI, // Find new register class to use. MachineOperand FalseReg = MI.getOperand(Invert ? 
1 : 2); - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg()); if (!MRI.constrainRegClass(DestReg, PreviousClass)) return nullptr; diff --git a/lib/Target/Lanai/LanaiInstrInfo.h b/lib/Target/Lanai/LanaiInstrInfo.h index d71424aeb0b1..59a04d2cc388 100644 --- a/lib/Target/Lanai/LanaiInstrInfo.h +++ b/lib/Target/Lanai/LanaiInstrInfo.h @@ -36,8 +36,7 @@ public: } bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, - const MachineInstr &MIb, - AliasAnalysis *AA) const override; + const MachineInstr &MIb) const override; unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; diff --git a/lib/Target/Lanai/LanaiRegisterInfo.cpp b/lib/Target/Lanai/LanaiRegisterInfo.cpp index d3056a1eba8e..7c28debb94dd 100644 --- a/lib/Target/Lanai/LanaiRegisterInfo.cpp +++ b/lib/Target/Lanai/LanaiRegisterInfo.cpp @@ -155,7 +155,7 @@ void LanaiRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (!HasFP || (needsStackRealignment(MF) && FrameIndex >= 0)) Offset += MF.getFrameInfo().getStackSize(); - unsigned FrameReg = getFrameRegister(MF); + Register FrameReg = getFrameRegister(MF); if (FrameIndex >= 0) { if (hasBasePointer(MF)) FrameReg = getBaseRegister(); diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp index 4313fa5a82b5..919d43ad9b9b 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp @@ -88,5 +88,5 @@ bool LanaiELFObjectWriter::needsRelocateWithSymbol(const MCSymbol & /*SD*/, std::unique_ptr<MCObjectTargetWriter> llvm::createLanaiELFObjectWriter(uint8_t OSABI) { - return llvm::make_unique<LanaiELFObjectWriter>(OSABI); + return std::make_unique<LanaiELFObjectWriter>(OSABI); } diff --git a/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp index a0ec14ae2381..85dcc0f152f9 100644 --- a/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp +++ b/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp @@ -191,33 +191,33 @@ public: } static std::unique_ptr<MSP430Operand> CreateToken(StringRef Str, SMLoc S) { - return make_unique<MSP430Operand>(Str, S); + return std::make_unique<MSP430Operand>(Str, S); } static std::unique_ptr<MSP430Operand> CreateReg(unsigned RegNum, SMLoc S, SMLoc E) { - return make_unique<MSP430Operand>(k_Reg, RegNum, S, E); + return std::make_unique<MSP430Operand>(k_Reg, RegNum, S, E); } static std::unique_ptr<MSP430Operand> CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) { - return make_unique<MSP430Operand>(Val, S, E); + return std::make_unique<MSP430Operand>(Val, S, E); } static std::unique_ptr<MSP430Operand> CreateMem(unsigned RegNum, const MCExpr *Val, SMLoc S, SMLoc E) { - return make_unique<MSP430Operand>(RegNum, Val, S, E); + return std::make_unique<MSP430Operand>(RegNum, Val, S, E); } static std::unique_ptr<MSP430Operand> CreateIndReg(unsigned RegNum, SMLoc S, SMLoc E) { - return make_unique<MSP430Operand>(k_IndReg, RegNum, S, E); + return std::make_unique<MSP430Operand>(k_IndReg, RegNum, S, E); } static std::unique_ptr<MSP430Operand> CreatePostIndReg(unsigned RegNum, SMLoc S, SMLoc E) { - return make_unique<MSP430Operand>(k_PostIndReg, RegNum, S, E); + return std::make_unique<MSP430Operand>(k_PostIndReg, RegNum, S, E); } SMLoc getStartLoc() const { return Start; } diff --git 
a/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp index 38b7da32c246..0cdd1f4f701f 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp +++ b/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp @@ -31,7 +31,7 @@ protected: unsigned getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const override { // Translate fixup kind to ELF relocation type. - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { case FK_Data_1: return ELF::R_MSP430_8; case FK_Data_2: return ELF::R_MSP430_16_BYTE; case FK_Data_4: return ELF::R_MSP430_32; @@ -54,5 +54,5 @@ protected: std::unique_ptr<MCObjectTargetWriter> llvm::createMSP430ELFObjectWriter(uint8_t OSABI) { - return llvm::make_unique<MSP430ELFObjectWriter>(OSABI); + return std::make_unique<MSP430ELFObjectWriter>(OSABI); } diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp index 3a71a084d1af..a3b91acdc6d0 100644 --- a/lib/Target/MSP430/MSP430AsmPrinter.cpp +++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp @@ -159,8 +159,9 @@ void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) { void MSP430AsmPrinter::EmitInterruptVectorSection(MachineFunction &ISR) { MCSection *Cur = OutStreamer->getCurrentSectionOnly(); const auto *F = &ISR.getFunction(); - assert(F->hasFnAttribute("interrupt") && - "Functions with MSP430_INTR CC should have 'interrupt' attribute"); + if (F->getCallingConv() != CallingConv::MSP430_INTR) { + report_fatal_error("Functions with 'interrupt' attribute must have msp430_intrcc CC"); + } StringRef IVIdx = F->getFnAttribute("interrupt").getValueAsString(); MCSection *IV = OutStreamer->getContext().getELFSection( "__interrupt_vector_" + IVIdx, @@ -174,8 +175,9 @@ void MSP430AsmPrinter::EmitInterruptVectorSection(MachineFunction &ISR) { bool MSP430AsmPrinter::runOnMachineFunction(MachineFunction &MF) { // Emit separate section for an interrupt vector if ISR - if (MF.getFunction().getCallingConv() == CallingConv::MSP430_INTR) + if (MF.getFunction().hasFnAttribute("interrupt")) { EmitInterruptVectorSection(MF); + } SetupMachineFunction(MF); EmitFunctionBody(); diff --git a/lib/Target/MSP430/MSP430BranchSelector.cpp b/lib/Target/MSP430/MSP430BranchSelector.cpp index 45e7c26e4d30..ce5affdc25b0 100644 --- a/lib/Target/MSP430/MSP430BranchSelector.cpp +++ b/lib/Target/MSP430/MSP430BranchSelector.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h index 33ce3c70a2a3..70e284053021 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.h +++ b/lib/Target/MSP430/MSP430FrameLowering.h @@ -22,7 +22,8 @@ protected: public: explicit MSP430FrameLowering() - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2, 2) {} + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(2), -2, + Align(2)) {} /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. 
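A recurring change across these hunks is the migration from raw integer (often log2) alignments to the llvm::Align type, as in the TargetFrameLowering constructor just above. Below is a minimal standalone sketch of the Align semantics these call sites rely on, assuming only the documented llvm/Support/Alignment.h API (Align, isAligned, offsetToAlignment); it is an illustration, not code from this commit.

#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  // Illustrative sketch, not part of the commit above.
  // Align wraps a power-of-two byte count; Align(2) mirrors the MSP430
  // frame-lowering change. The constructor asserts on non-powers-of-two.
  llvm::Align A(2);
  assert(A.value() == 2);

  // isAligned(A, N) is true when N is a multiple of the alignment.
  assert(llvm::isAligned(A, 6));

  // offsetToAlignment(V, A) returns how many bytes V is short of the next
  // A-aligned value; the MipsAsmParser branch-target checks treat a nonzero
  // result as a misaligned address.
  assert(llvm::offsetToAlignment(5, A) == 1);
  assert(llvm::offsetToAlignment(4, A) == 0);
  return 0;
}

Note that the replaced interfaces took log2 values, which is why calls such as Lanai's setMinFunctionAlignment change from 2 to Align(4): both denote a 4-byte alignment.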
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index fedfb857bd0f..64169d1f5eb1 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -327,8 +327,8 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::MSP430_BUILTIN); // TODO: __mspabi_srall, __mspabi_srlll, __mspabi_sllll - setMinFunctionAlignment(1); - setPrefFunctionAlignment(1); + setMinFunctionAlignment(Align(2)); + setPrefFunctionAlignment(Align(2)); } SDValue MSP430TargetLowering::LowerOperation(SDValue Op, @@ -353,6 +353,9 @@ SDValue MSP430TargetLowering::LowerOperation(SDValue Op, } } +unsigned MSP430TargetLowering::getShiftAmountThreshold(EVT VT) const { + return 2; +} //===----------------------------------------------------------------------===// // MSP430 Inline Assembly Support //===----------------------------------------------------------------------===// @@ -632,7 +635,7 @@ SDValue MSP430TargetLowering::LowerCCCArguments( llvm_unreachable(nullptr); } case MVT::i16: - unsigned VReg = RegInfo.createVirtualRegister(&MSP430::GR16RegClass); + Register VReg = RegInfo.createVirtualRegister(&MSP430::GR16RegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, RegVT); @@ -1446,8 +1449,8 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr &MI, case MSP430::Rrcl16: { BuildMI(*BB, MI, dl, TII.get(MSP430::BIC16rc), MSP430::SR) .addReg(MSP430::SR).addImm(1); - unsigned SrcReg = MI.getOperand(1).getReg(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); unsigned RrcOpc = MI.getOpcode() == MSP430::Rrcl16 ? 
MSP430::RRC16r : MSP430::RRC8r; BuildMI(*BB, MI, dl, TII.get(RrcOpc), DstReg) @@ -1479,13 +1482,13 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr &MI, LoopBB->addSuccessor(RemBB); LoopBB->addSuccessor(LoopBB); - unsigned ShiftAmtReg = RI.createVirtualRegister(&MSP430::GR8RegClass); - unsigned ShiftAmtReg2 = RI.createVirtualRegister(&MSP430::GR8RegClass); - unsigned ShiftReg = RI.createVirtualRegister(RC); - unsigned ShiftReg2 = RI.createVirtualRegister(RC); - unsigned ShiftAmtSrcReg = MI.getOperand(2).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register ShiftAmtReg = RI.createVirtualRegister(&MSP430::GR8RegClass); + Register ShiftAmtReg2 = RI.createVirtualRegister(&MSP430::GR8RegClass); + Register ShiftReg = RI.createVirtualRegister(RC); + Register ShiftReg2 = RI.createVirtualRegister(RC); + Register ShiftAmtSrcReg = MI.getOperand(2).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); // BB: // cmp 0, N diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h index ee6b6316d7a9..9224e5e3d005 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.h +++ b/lib/Target/MSP430/MSP430ISelLowering.h @@ -124,6 +124,8 @@ namespace llvm { bool isZExtFree(EVT VT1, EVT VT2) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; + unsigned getShiftAmountThreshold(EVT VT) const override; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override; diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index afbb2f213b45..bec357a1548d 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -139,7 +139,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, return; // We need to materialize the offset via add instruction. 
- unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); if (Offset < 0) BuildMI(MBB, std::next(II), dl, TII.get(MSP430::SUB16ri), DstReg) .addReg(DstReg).addImm(-Offset); diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp index 8c4ca982c966..e9aeba76de85 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -46,7 +46,7 @@ MSP430TargetMachine::MSP430TargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options), TT, CPU, FS, Options, getEffectiveRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), - TLOF(make_unique<TargetLoweringObjectFileELF>()), + TLOF(std::make_unique<TargetLoweringObjectFileELF>()), Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 1f7d095bf49b..21d0df74d458 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -39,6 +39,7 @@ #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/SubtargetFeature.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" @@ -233,9 +234,14 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); - bool expandLoadImmReal(MCInst &Inst, bool IsSingle, bool IsGPR, bool Is64FPU, - SMLoc IDLoc, MCStreamer &Out, - const MCSubtargetInfo *STI); + bool expandLoadSingleImmToGPR(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI); + bool expandLoadSingleImmToFPR(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI); + bool expandLoadDoubleImmToGPR(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI); + bool expandLoadDoubleImmToFPR(MCInst &Inst, bool Is64FPU, SMLoc IDLoc, + MCStreamer &Out, const MCSubtargetInfo *STI); bool expandLoadAddress(unsigned DstReg, unsigned BaseReg, const MCOperand &Offset, bool Is32BitAddress, @@ -512,11 +518,11 @@ public: // Remember the initial assembler options. The user can not modify these. AssemblerOptions.push_back( - llvm::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits())); + std::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits())); // Create an assembler options environment for the user to modify. 
AssemblerOptions.push_back( - llvm::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits())); + std::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits())); getTargetStreamer().updateABIInfo(*this); @@ -844,7 +850,7 @@ private: const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, MipsAsmParser &Parser) { - auto Op = llvm::make_unique<MipsOperand>(k_RegisterIndex, Parser); + auto Op = std::make_unique<MipsOperand>(k_RegisterIndex, Parser); Op->RegIdx.Index = Index; Op->RegIdx.RegInfo = RegInfo; Op->RegIdx.Kind = RegKind; @@ -1446,7 +1452,7 @@ public: static std::unique_ptr<MipsOperand> CreateToken(StringRef Str, SMLoc S, MipsAsmParser &Parser) { - auto Op = llvm::make_unique<MipsOperand>(k_Token, Parser); + auto Op = std::make_unique<MipsOperand>(k_Token, Parser); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->StartLoc = S; @@ -1521,7 +1527,7 @@ public: static std::unique_ptr<MipsOperand> CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, MipsAsmParser &Parser) { - auto Op = llvm::make_unique<MipsOperand>(k_Immediate, Parser); + auto Op = std::make_unique<MipsOperand>(k_Immediate, Parser); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -1531,7 +1537,7 @@ public: static std::unique_ptr<MipsOperand> CreateMem(std::unique_ptr<MipsOperand> Base, const MCExpr *Off, SMLoc S, SMLoc E, MipsAsmParser &Parser) { - auto Op = llvm::make_unique<MipsOperand>(k_Memory, Parser); + auto Op = std::make_unique<MipsOperand>(k_Memory, Parser); Op->Mem.Base = Base.release(); Op->Mem.Off = Off; Op->StartLoc = S; @@ -1544,7 +1550,7 @@ public: MipsAsmParser &Parser) { assert(Regs.size() > 0 && "Empty list not allowed"); - auto Op = llvm::make_unique<MipsOperand>(k_RegList, Parser); + auto Op = std::make_unique<MipsOperand>(k_RegList, Parser); Op->RegList.List = new SmallVector<unsigned, 10>(Regs.begin(), Regs.end()); Op->StartLoc = StartLoc; Op->EndLoc = EndLoc; @@ -1804,8 +1810,8 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, break; // We'll deal with this situation later on when applying fixups. if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm())) return Error(IDLoc, "branch target out of range"); - if (OffsetToAlignment(Offset.getImm(), - 1LL << (inMicroMipsMode() ? 1 : 2))) + if (offsetToAlignment(Offset.getImm(), + (inMicroMipsMode() ? Align(2) : Align(4)))) return Error(IDLoc, "branch to misaligned address"); break; case Mips::BGEZ: @@ -1834,8 +1840,8 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, break; // We'll deal with this situation later on when applying fixups. if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm())) return Error(IDLoc, "branch target out of range"); - if (OffsetToAlignment(Offset.getImm(), - 1LL << (inMicroMipsMode() ? 1 : 2))) + if (offsetToAlignment(Offset.getImm(), + (inMicroMipsMode() ? Align(2) : Align(4)))) return Error(IDLoc, "branch to misaligned address"); break; case Mips::BGEC: case Mips::BGEC_MMR6: @@ -1850,7 +1856,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, break; // We'll deal with this situation later on when applying fixups. if (!isIntN(18, Offset.getImm())) return Error(IDLoc, "branch target out of range"); - if (OffsetToAlignment(Offset.getImm(), 1LL << 2)) + if (offsetToAlignment(Offset.getImm(), Align(4))) return Error(IDLoc, "branch to misaligned address"); break; case Mips::BLEZC: case Mips::BLEZC_MMR6: @@ -1863,7 +1869,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, break; // We'll deal with this situation later on when applying fixups. 
if (!isIntN(18, Offset.getImm())) return Error(IDLoc, "branch target out of range"); - if (OffsetToAlignment(Offset.getImm(), 1LL << 2)) + if (offsetToAlignment(Offset.getImm(), Align(4))) return Error(IDLoc, "branch to misaligned address"); break; case Mips::BEQZC: case Mips::BEQZC_MMR6: @@ -1874,7 +1880,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, break; // We'll deal with this situation later on when applying fixups. if (!isIntN(23, Offset.getImm())) return Error(IDLoc, "branch target out of range"); - if (OffsetToAlignment(Offset.getImm(), 1LL << 2)) + if (offsetToAlignment(Offset.getImm(), Align(4))) return Error(IDLoc, "branch to misaligned address"); break; case Mips::BEQZ16_MM: @@ -1887,7 +1893,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, break; // We'll deal with this situation later on when applying fixups. if (!isInt<8>(Offset.getImm())) return Error(IDLoc, "branch target out of range"); - if (OffsetToAlignment(Offset.getImm(), 2LL)) + if (offsetToAlignment(Offset.getImm(), Align(2))) return Error(IDLoc, "branch to misaligned address"); break; } @@ -2454,25 +2460,21 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, : MER_Success; case Mips::LoadImmSingleGPR: - return expandLoadImmReal(Inst, true, true, false, IDLoc, Out, STI) - ? MER_Fail - : MER_Success; + return expandLoadSingleImmToGPR(Inst, IDLoc, Out, STI) ? MER_Fail + : MER_Success; case Mips::LoadImmSingleFGR: - return expandLoadImmReal(Inst, true, false, false, IDLoc, Out, STI) - ? MER_Fail - : MER_Success; + return expandLoadSingleImmToFPR(Inst, IDLoc, Out, STI) ? MER_Fail + : MER_Success; case Mips::LoadImmDoubleGPR: - return expandLoadImmReal(Inst, false, true, false, IDLoc, Out, STI) - ? MER_Fail - : MER_Success; + return expandLoadDoubleImmToGPR(Inst, IDLoc, Out, STI) ? MER_Fail + : MER_Success; case Mips::LoadImmDoubleFGR: - return expandLoadImmReal(Inst, false, false, true, IDLoc, Out, STI) - ? MER_Fail - : MER_Success; + return expandLoadDoubleImmToFPR(Inst, true, IDLoc, Out, STI) ? MER_Fail + : MER_Success; case Mips::LoadImmDoubleFGR_32: - return expandLoadImmReal(Inst, false, false, false, IDLoc, Out, STI) - ? MER_Fail - : MER_Success; + return expandLoadDoubleImmToFPR(Inst, false, IDLoc, Out, STI) ? MER_Fail + : MER_Success; + case Mips::Ulh: return expandUlh(Inst, true, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::Ulhu: @@ -2868,12 +2870,12 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, bool Is32BitSym, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { - // FIXME: These expansions do not respect -mxgot. 
MipsTargetStreamer &TOut = getTargetStreamer(); - bool UseSrcReg = SrcReg != Mips::NoRegister; + bool UseSrcReg = SrcReg != Mips::NoRegister && SrcReg != Mips::ZERO && + SrcReg != Mips::ZERO_64; warnIfNoMacro(IDLoc); - if (inPicMode() && ABI.IsO32()) { + if (inPicMode()) { MCValue Res; if (!SymExpr->evaluateAsRelocatable(Res, nullptr, nullptr)) { Error(IDLoc, "expected relocatable expression"); @@ -2884,46 +2886,41 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, return true; } + bool IsPtr64 = ABI.ArePtrs64bit(); + bool IsLocalSym = + Res.getSymA()->getSymbol().isInSection() || + Res.getSymA()->getSymbol().isTemporary() || + (Res.getSymA()->getSymbol().isELF() && + cast<MCSymbolELF>(Res.getSymA()->getSymbol()).getBinding() == + ELF::STB_LOCAL); + bool UseXGOT = STI->getFeatureBits()[Mips::FeatureXGOT] && !IsLocalSym; + // The case where the result register is $25 is somewhat special. If the // symbol in the final relocation is external and not modified with a - // constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT16. + // constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT16 + // or R_MIPS_CALL16 instead of R_MIPS_GOT_DISP in 64-bit case. if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg && - Res.getConstant() == 0 && - !(Res.getSymA()->getSymbol().isInSection() || - Res.getSymA()->getSymbol().isTemporary() || - (Res.getSymA()->getSymbol().isELF() && - cast<MCSymbolELF>(Res.getSymA()->getSymbol()).getBinding() == - ELF::STB_LOCAL))) { - const MCExpr *CallExpr = - MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext()); - TOut.emitRRX(Mips::LW, DstReg, GPReg, MCOperand::createExpr(CallExpr), - IDLoc, STI); + Res.getConstant() == 0 && !IsLocalSym) { + if (UseXGOT) { + const MCExpr *CallHiExpr = MipsMCExpr::create(MipsMCExpr::MEK_CALL_HI16, + SymExpr, getContext()); + const MCExpr *CallLoExpr = MipsMCExpr::create(MipsMCExpr::MEK_CALL_LO16, + SymExpr, getContext()); + TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(CallHiExpr), IDLoc, + STI); + TOut.emitRRR(IsPtr64 ? Mips::DADDu : Mips::ADDu, DstReg, DstReg, GPReg, + IDLoc, STI); + TOut.emitRRX(IsPtr64 ? Mips::LD : Mips::LW, DstReg, DstReg, + MCOperand::createExpr(CallLoExpr), IDLoc, STI); + } else { + const MCExpr *CallExpr = + MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext()); + TOut.emitRRX(IsPtr64 ? Mips::LD : Mips::LW, DstReg, GPReg, + MCOperand::createExpr(CallExpr), IDLoc, STI); + } return false; } - // The remaining cases are: - // External GOT: lw $tmp, %got(symbol+offset)($gp) - // >addiu $tmp, $tmp, %lo(offset) - // >addiu $rd, $tmp, $rs - // Local GOT: lw $tmp, %got(symbol+offset)($gp) - // addiu $tmp, $tmp, %lo(symbol+offset)($gp) - // >addiu $rd, $tmp, $rs - // The addiu's marked with a '>' may be omitted if they are redundant. If - // this happens then the last instruction must use $rd as the result - // register. - const MipsMCExpr *GotExpr = - MipsMCExpr::create(MipsMCExpr::MEK_GOT, SymExpr, getContext()); - const MCExpr *LoExpr = nullptr; - if (Res.getSymA()->getSymbol().isInSection() || - Res.getSymA()->getSymbol().isTemporary()) - LoExpr = MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext()); - else if (Res.getConstant() != 0) { - // External symbols fully resolve the symbol with just the %got(symbol) - // but we must still account for any offset to the symbol for expressions - // like symbol+8. 
- LoExpr = MCConstantExpr::create(Res.getConstant(), getContext()); - } - unsigned TmpReg = DstReg; if (UseSrcReg && getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, @@ -2936,94 +2933,102 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, TmpReg = ATReg; } - TOut.emitRRX(Mips::LW, TmpReg, GPReg, MCOperand::createExpr(GotExpr), IDLoc, - STI); + if (UseXGOT) { + // Loading address from XGOT + // External GOT: lui $tmp, %got_hi(symbol)($gp) + // addu $tmp, $tmp, $gp + // lw $tmp, %got_lo(symbol)($tmp) + // >addiu $tmp, $tmp, offset + // >addiu $rd, $tmp, $rs + // The addiu's marked with a '>' may be omitted if they are redundant. If + // this happens then the last instruction must use $rd as the result + // register. + const MCExpr *CallHiExpr = + MipsMCExpr::create(MipsMCExpr::MEK_GOT_HI16, SymExpr, getContext()); + const MCExpr *CallLoExpr = MipsMCExpr::create( + MipsMCExpr::MEK_GOT_LO16, Res.getSymA(), getContext()); - if (LoExpr) - TOut.emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), + TOut.emitRX(Mips::LUi, TmpReg, MCOperand::createExpr(CallHiExpr), IDLoc, + STI); + TOut.emitRRR(IsPtr64 ? Mips::DADDu : Mips::ADDu, TmpReg, TmpReg, GPReg, IDLoc, STI); + TOut.emitRRX(IsPtr64 ? Mips::LD : Mips::LW, TmpReg, TmpReg, + MCOperand::createExpr(CallLoExpr), IDLoc, STI); - if (UseSrcReg) - TOut.emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, STI); + if (Res.getConstant() != 0) + TOut.emitRRX(IsPtr64 ? Mips::DADDiu : Mips::ADDiu, TmpReg, TmpReg, + MCOperand::createExpr(MCConstantExpr::create( + Res.getConstant(), getContext())), + IDLoc, STI); - return false; - } - - if (inPicMode() && ABI.ArePtrs64bit()) { - MCValue Res; - if (!SymExpr->evaluateAsRelocatable(Res, nullptr, nullptr)) { - Error(IDLoc, "expected relocatable expression"); - return true; - } - if (Res.getSymB() != nullptr) { - Error(IDLoc, "expected relocatable expression with only one symbol"); - return true; - } - - // The case where the result register is $25 is somewhat special. If the - // symbol in the final relocation is external and not modified with a - // constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT_DISP. - if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg && - Res.getConstant() == 0 && - !(Res.getSymA()->getSymbol().isInSection() || - Res.getSymA()->getSymbol().isTemporary() || - (Res.getSymA()->getSymbol().isELF() && - cast<MCSymbolELF>(Res.getSymA()->getSymbol()).getBinding() == - ELF::STB_LOCAL))) { - const MCExpr *CallExpr = - MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext()); - TOut.emitRRX(Mips::LD, DstReg, GPReg, MCOperand::createExpr(CallExpr), - IDLoc, STI); + if (UseSrcReg) + TOut.emitRRR(IsPtr64 ? Mips::DADDu : Mips::ADDu, DstReg, TmpReg, SrcReg, + IDLoc, STI); return false; } - // The remaining cases are: - // Small offset: ld $tmp, %got_disp(symbol)($gp) - // >daddiu $tmp, $tmp, offset - // >daddu $rd, $tmp, $rs - // The daddiu's marked with a '>' may be omitted if they are redundant. If - // this happens then the last instruction must use $rd as the result - // register. - const MipsMCExpr *GotExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, - Res.getSymA(), - getContext()); + const MipsMCExpr *GotExpr = nullptr; const MCExpr *LoExpr = nullptr; - if (Res.getConstant() != 0) { - // Symbols fully resolve with just the %got_disp(symbol) but we - // must still account for any offset to the symbol for - // expressions like symbol+8. 
- LoExpr = MCConstantExpr::create(Res.getConstant(), getContext()); + if (IsPtr64) { + // The remaining cases are: + // Small offset: ld $tmp, %got_disp(symbol)($gp) + // >daddiu $tmp, $tmp, offset + // >daddu $rd, $tmp, $rs + // The daddiu's marked with a '>' may be omitted if they are redundant. If + // this happens then the last instruction must use $rd as the result + // register. + GotExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, Res.getSymA(), + getContext()); + if (Res.getConstant() != 0) { + // Symbols fully resolve with just the %got_disp(symbol) but we + // must still account for any offset to the symbol for + // expressions like symbol+8. + LoExpr = MCConstantExpr::create(Res.getConstant(), getContext()); - // FIXME: Offsets greater than 16 bits are not yet implemented. - // FIXME: The correct range is a 32-bit sign-extended number. - if (Res.getConstant() < -0x8000 || Res.getConstant() > 0x7fff) { - Error(IDLoc, "macro instruction uses large offset, which is not " - "currently supported"); - return true; + // FIXME: Offsets greater than 16 bits are not yet implemented. + // FIXME: The correct range is a 32-bit sign-extended number. + if (Res.getConstant() < -0x8000 || Res.getConstant() > 0x7fff) { + Error(IDLoc, "macro instruction uses large offset, which is not " + "currently supported"); + return true; + } + } + } else { + // The remaining cases are: + // External GOT: lw $tmp, %got(symbol)($gp) + // >addiu $tmp, $tmp, offset + // >addiu $rd, $tmp, $rs + // Local GOT: lw $tmp, %got(symbol+offset)($gp) + // addiu $tmp, $tmp, %lo(symbol+offset)($gp) + // >addiu $rd, $tmp, $rs + // The addiu's marked with a '>' may be omitted if they are redundant. If + // this happens then the last instruction must use $rd as the result + // register. + if (IsLocalSym) { + GotExpr = + MipsMCExpr::create(MipsMCExpr::MEK_GOT, SymExpr, getContext()); + LoExpr = MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext()); + } else { + // External symbols fully resolve the symbol with just the %got(symbol) + // but we must still account for any offset to the symbol for + // expressions like symbol+8. + GotExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT, Res.getSymA(), + getContext()); + if (Res.getConstant() != 0) + LoExpr = MCConstantExpr::create(Res.getConstant(), getContext()); } } - unsigned TmpReg = DstReg; - if (UseSrcReg && - getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, - SrcReg)) { - // If $rs is the same as $rd, we need to use AT. - // If it is not available we exit. - unsigned ATReg = getATReg(IDLoc); - if (!ATReg) - return true; - TmpReg = ATReg; - } - - TOut.emitRRX(Mips::LD, TmpReg, GPReg, MCOperand::createExpr(GotExpr), IDLoc, - STI); + TOut.emitRRX(IsPtr64 ? Mips::LD : Mips::LW, TmpReg, GPReg, + MCOperand::createExpr(GotExpr), IDLoc, STI); if (LoExpr) - TOut.emitRRX(Mips::DADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), - IDLoc, STI); + TOut.emitRRX(IsPtr64 ? Mips::DADDiu : Mips::ADDiu, TmpReg, TmpReg, + MCOperand::createExpr(LoExpr), IDLoc, STI); if (UseSrcReg) - TOut.emitRRR(Mips::DADDu, DstReg, TmpReg, SrcReg, IDLoc, STI); + TOut.emitRRR(IsPtr64 ? 
Mips::DADDu : Mips::ADDu, DstReg, TmpReg, SrcReg, + IDLoc, STI); return false; } @@ -3289,10 +3294,43 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc, return false; } -bool MipsAsmParser::expandLoadImmReal(MCInst &Inst, bool IsSingle, bool IsGPR, - bool Is64FPU, SMLoc IDLoc, - MCStreamer &Out, - const MCSubtargetInfo *STI) { +static uint64_t convertIntToDoubleImm(uint64_t ImmOp64) { + // If ImmOp64 is AsmToken::Integer type (all bits set to zero in the + // exponent field), convert it to double (e.g. 1 to 1.0) + if ((Hi_32(ImmOp64) & 0x7ff00000) == 0) { + APFloat RealVal(APFloat::IEEEdouble(), ImmOp64); + ImmOp64 = RealVal.bitcastToAPInt().getZExtValue(); + } + return ImmOp64; +} + +static uint32_t covertDoubleImmToSingleImm(uint64_t ImmOp64) { + // Conversion of a double in an uint64_t to a float in a uint32_t, + // retaining the bit pattern of a float. + double DoubleImm = BitsToDouble(ImmOp64); + float TmpFloat = static_cast<float>(DoubleImm); + return FloatToBits(TmpFloat); +} + +bool MipsAsmParser::expandLoadSingleImmToGPR(MCInst &Inst, SMLoc IDLoc, + MCStreamer &Out, + const MCSubtargetInfo *STI) { + assert(Inst.getNumOperands() == 2 && "Invalid operand count"); + assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() && + "Invalid instruction operand."); + + unsigned FirstReg = Inst.getOperand(0).getReg(); + uint64_t ImmOp64 = Inst.getOperand(1).getImm(); + + uint32_t ImmOp32 = covertDoubleImmToSingleImm(convertIntToDoubleImm(ImmOp64)); + + return loadImmediate(ImmOp32, FirstReg, Mips::NoRegister, true, false, IDLoc, + Out, STI); +} + +bool MipsAsmParser::expandLoadSingleImmToFPR(MCInst &Inst, SMLoc IDLoc, + MCStreamer &Out, + const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); assert(Inst.getNumOperands() == 2 && "Invalid operand count"); assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() && @@ -3301,166 +3339,184 @@ bool MipsAsmParser::expandLoadImmReal(MCInst &Inst, bool IsSingle, bool IsGPR, unsigned FirstReg = Inst.getOperand(0).getReg(); uint64_t ImmOp64 = Inst.getOperand(1).getImm(); - uint32_t HiImmOp64 = (ImmOp64 & 0xffffffff00000000) >> 32; - // If ImmOp64 is AsmToken::Integer type (all bits set to zero in the - // exponent field), convert it to double (e.g. 1 to 1.0) - if ((HiImmOp64 & 0x7ff00000) == 0) { - APFloat RealVal(APFloat::IEEEdouble(), ImmOp64); - ImmOp64 = RealVal.bitcastToAPInt().getZExtValue(); + ImmOp64 = convertIntToDoubleImm(ImmOp64); + + uint32_t ImmOp32 = covertDoubleImmToSingleImm(ImmOp64); + + unsigned TmpReg = Mips::ZERO; + if (ImmOp32 != 0) { + TmpReg = getATReg(IDLoc); + if (!TmpReg) + return true; } - uint32_t LoImmOp64 = ImmOp64 & 0xffffffff; - HiImmOp64 = (ImmOp64 & 0xffffffff00000000) >> 32; + if (Lo_32(ImmOp64) == 0) { + if (TmpReg != Mips::ZERO && loadImmediate(ImmOp32, TmpReg, Mips::NoRegister, + true, false, IDLoc, Out, STI)) + return true; + TOut.emitRR(Mips::MTC1, FirstReg, TmpReg, IDLoc, STI); + return false; + } - if (IsSingle) { - // Conversion of a double in an uint64_t to a float in a uint32_t, - // retaining the bit pattern of a float. - uint32_t ImmOp32; - double doubleImm = BitsToDouble(ImmOp64); - float tmp_float = static_cast<float>(doubleImm); - ImmOp32 = FloatToBits(tmp_float); + MCSection *CS = getStreamer().getCurrentSectionOnly(); + // FIXME: Enhance this expansion to use the .lit4 & .lit8 sections + // where appropriate. 
+ MCSection *ReadOnlySection = + getContext().getELFSection(".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); - if (IsGPR) { - if (loadImmediate(ImmOp32, FirstReg, Mips::NoRegister, true, true, IDLoc, - Out, STI)) - return true; - return false; - } else { - unsigned ATReg = getATReg(IDLoc); - if (!ATReg) - return true; - if (LoImmOp64 == 0) { - if (loadImmediate(ImmOp32, ATReg, Mips::NoRegister, true, true, IDLoc, - Out, STI)) - return true; - TOut.emitRR(Mips::MTC1, FirstReg, ATReg, IDLoc, STI); - return false; - } + MCSymbol *Sym = getContext().createTempSymbol(); + const MCExpr *LoSym = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); + const MipsMCExpr *LoExpr = + MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext()); - MCSection *CS = getStreamer().getCurrentSectionOnly(); - // FIXME: Enhance this expansion to use the .lit4 & .lit8 sections - // where appropriate. - MCSection *ReadOnlySection = getContext().getELFSection( - ".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + getStreamer().SwitchSection(ReadOnlySection); + getStreamer().EmitLabel(Sym, IDLoc); + getStreamer().EmitIntValue(ImmOp32, 4); + getStreamer().SwitchSection(CS); - MCSymbol *Sym = getContext().createTempSymbol(); - const MCExpr *LoSym = - MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); - const MipsMCExpr *LoExpr = - MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext()); + if (emitPartialAddress(TOut, IDLoc, Sym)) + return true; + TOut.emitRRX(Mips::LWC1, FirstReg, TmpReg, MCOperand::createExpr(LoExpr), + IDLoc, STI); + return false; +} + +bool MipsAsmParser::expandLoadDoubleImmToGPR(MCInst &Inst, SMLoc IDLoc, + MCStreamer &Out, + const MCSubtargetInfo *STI) { + MipsTargetStreamer &TOut = getTargetStreamer(); + assert(Inst.getNumOperands() == 2 && "Invalid operand count"); + assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() && + "Invalid instruction operand."); - getStreamer().SwitchSection(ReadOnlySection); - getStreamer().EmitLabel(Sym, IDLoc); - getStreamer().EmitIntValue(ImmOp32, 4); - getStreamer().SwitchSection(CS); + unsigned FirstReg = Inst.getOperand(0).getReg(); + uint64_t ImmOp64 = Inst.getOperand(1).getImm(); - if(emitPartialAddress(TOut, IDLoc, Sym)) + ImmOp64 = convertIntToDoubleImm(ImmOp64); + + if (Lo_32(ImmOp64) == 0) { + if (isGP64bit()) { + if (loadImmediate(ImmOp64, FirstReg, Mips::NoRegister, false, false, + IDLoc, Out, STI)) + return true; + } else { + if (loadImmediate(Hi_32(ImmOp64), FirstReg, Mips::NoRegister, true, false, + IDLoc, Out, STI)) + return true; + + if (loadImmediate(0, nextReg(FirstReg), Mips::NoRegister, true, false, + IDLoc, Out, STI)) return true; - TOut.emitRRX(Mips::LWC1, FirstReg, ATReg, - MCOperand::createExpr(LoExpr), IDLoc, STI); } return false; } - // if(!IsSingle) - unsigned ATReg = getATReg(IDLoc); - if (!ATReg) + MCSection *CS = getStreamer().getCurrentSectionOnly(); + MCSection *ReadOnlySection = + getContext().getELFSection(".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + + MCSymbol *Sym = getContext().createTempSymbol(); + const MCExpr *LoSym = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); + const MipsMCExpr *LoExpr = + MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext()); + + getStreamer().SwitchSection(ReadOnlySection); + getStreamer().EmitLabel(Sym, IDLoc); + getStreamer().EmitValueToAlignment(8); + getStreamer().EmitIntValue(ImmOp64, 8); + getStreamer().SwitchSection(CS); + + unsigned TmpReg = getATReg(IDLoc); + if (!TmpReg) return true; - if (IsGPR) { - if 
(LoImmOp64 == 0) { - if(isABI_N32() || isABI_N64()) { - if (loadImmediate(HiImmOp64, FirstReg, Mips::NoRegister, false, true, - IDLoc, Out, STI)) - return true; - return false; - } else { - if (loadImmediate(HiImmOp64, FirstReg, Mips::NoRegister, true, true, - IDLoc, Out, STI)) - return true; + if (emitPartialAddress(TOut, IDLoc, Sym)) + return true; - if (loadImmediate(0, nextReg(FirstReg), Mips::NoRegister, true, true, - IDLoc, Out, STI)) - return true; - return false; - } - } + TOut.emitRRX(isABI_N64() ? Mips::DADDiu : Mips::ADDiu, TmpReg, TmpReg, + MCOperand::createExpr(LoExpr), IDLoc, STI); - MCSection *CS = getStreamer().getCurrentSectionOnly(); - MCSection *ReadOnlySection = getContext().getELFSection( - ".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + if (isGP64bit()) + TOut.emitRRI(Mips::LD, FirstReg, TmpReg, 0, IDLoc, STI); + else { + TOut.emitRRI(Mips::LW, FirstReg, TmpReg, 0, IDLoc, STI); + TOut.emitRRI(Mips::LW, nextReg(FirstReg), TmpReg, 4, IDLoc, STI); + } + return false; +} - MCSymbol *Sym = getContext().createTempSymbol(); - const MCExpr *LoSym = - MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); - const MipsMCExpr *LoExpr = - MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext()); +bool MipsAsmParser::expandLoadDoubleImmToFPR(MCInst &Inst, bool Is64FPU, + SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI) { + MipsTargetStreamer &TOut = getTargetStreamer(); + assert(Inst.getNumOperands() == 2 && "Invalid operand count"); + assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() && + "Invalid instruction operand."); + + unsigned FirstReg = Inst.getOperand(0).getReg(); + uint64_t ImmOp64 = Inst.getOperand(1).getImm(); - getStreamer().SwitchSection(ReadOnlySection); - getStreamer().EmitLabel(Sym, IDLoc); - getStreamer().EmitIntValue(HiImmOp64, 4); - getStreamer().EmitIntValue(LoImmOp64, 4); - getStreamer().SwitchSection(CS); + ImmOp64 = convertIntToDoubleImm(ImmOp64); - if(emitPartialAddress(TOut, IDLoc, Sym)) + unsigned TmpReg = Mips::ZERO; + if (ImmOp64 != 0) { + TmpReg = getATReg(IDLoc); + if (!TmpReg) return true; - if(isABI_N64()) - TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, - MCOperand::createExpr(LoExpr), IDLoc, STI); - else - TOut.emitRRX(Mips::ADDiu, ATReg, ATReg, - MCOperand::createExpr(LoExpr), IDLoc, STI); + } - if(isABI_N32() || isABI_N64()) - TOut.emitRRI(Mips::LD, FirstReg, ATReg, 0, IDLoc, STI); - else { - TOut.emitRRI(Mips::LW, FirstReg, ATReg, 0, IDLoc, STI); - TOut.emitRRI(Mips::LW, nextReg(FirstReg), ATReg, 4, IDLoc, STI); - } - return false; - } else { // if(!IsGPR && !IsSingle) - if ((LoImmOp64 == 0) && - !((HiImmOp64 & 0xffff0000) && (HiImmOp64 & 0x0000ffff))) { - // FIXME: In the case where the constant is zero, we can load the - // register directly from the zero register. 
- if (loadImmediate(HiImmOp64, ATReg, Mips::NoRegister, true, true, IDLoc, + if ((Lo_32(ImmOp64) == 0) && + !((Hi_32(ImmOp64) & 0xffff0000) && (Hi_32(ImmOp64) & 0x0000ffff))) { + if (isGP64bit()) { + if (TmpReg != Mips::ZERO && + loadImmediate(ImmOp64, TmpReg, Mips::NoRegister, false, false, IDLoc, Out, STI)) return true; - if (isABI_N32() || isABI_N64()) - TOut.emitRR(Mips::DMTC1, FirstReg, ATReg, IDLoc, STI); - else if (hasMips32r2()) { - TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI); - TOut.emitRRR(Mips::MTHC1_D32, FirstReg, FirstReg, ATReg, IDLoc, STI); - } else { - TOut.emitRR(Mips::MTC1, nextReg(FirstReg), ATReg, IDLoc, STI); - TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI); - } + TOut.emitRR(Mips::DMTC1, FirstReg, TmpReg, IDLoc, STI); return false; } - MCSection *CS = getStreamer().getCurrentSectionOnly(); - // FIXME: Enhance this expansion to use the .lit4 & .lit8 sections - // where appropriate. - MCSection *ReadOnlySection = getContext().getELFSection( - ".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + if (TmpReg != Mips::ZERO && + loadImmediate(Hi_32(ImmOp64), TmpReg, Mips::NoRegister, true, false, + IDLoc, Out, STI)) + return true; + + if (hasMips32r2()) { + TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI); + TOut.emitRRR(Mips::MTHC1_D32, FirstReg, FirstReg, TmpReg, IDLoc, STI); + } else { + TOut.emitRR(Mips::MTC1, nextReg(FirstReg), TmpReg, IDLoc, STI); + TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI); + } + return false; + } + + MCSection *CS = getStreamer().getCurrentSectionOnly(); + // FIXME: Enhance this expansion to use the .lit4 & .lit8 sections + // where appropriate. + MCSection *ReadOnlySection = + getContext().getELFSection(".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); - MCSymbol *Sym = getContext().createTempSymbol(); - const MCExpr *LoSym = - MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); - const MipsMCExpr *LoExpr = - MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext()); + MCSymbol *Sym = getContext().createTempSymbol(); + const MCExpr *LoSym = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); + const MipsMCExpr *LoExpr = + MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext()); - getStreamer().SwitchSection(ReadOnlySection); - getStreamer().EmitLabel(Sym, IDLoc); - getStreamer().EmitIntValue(HiImmOp64, 4); - getStreamer().EmitIntValue(LoImmOp64, 4); - getStreamer().SwitchSection(CS); + getStreamer().SwitchSection(ReadOnlySection); + getStreamer().EmitLabel(Sym, IDLoc); + getStreamer().EmitValueToAlignment(8); + getStreamer().EmitIntValue(ImmOp64, 8); + getStreamer().SwitchSection(CS); + + if (emitPartialAddress(TOut, IDLoc, Sym)) + return true; + + TOut.emitRRX(Is64FPU ? Mips::LDC164 : Mips::LDC1, FirstReg, TmpReg, + MCOperand::createExpr(LoExpr), IDLoc, STI); - if(emitPartialAddress(TOut, IDLoc, Sym)) - return true; - TOut.emitRRX(Is64FPU ? 
Mips::LDC164 : Mips::LDC1, FirstReg, ATReg, - MCOperand::createExpr(LoExpr), IDLoc, STI); - } return false; } @@ -3489,7 +3545,7 @@ bool MipsAsmParser::expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc, } else { if (!isInt<17>(Offset.getImm())) return Error(IDLoc, "branch target out of range"); - if (OffsetToAlignment(Offset.getImm(), 1LL << 1)) + if (offsetToAlignment(Offset.getImm(), Align(2))) return Error(IDLoc, "branch to misaligned address"); Inst.clear(); Inst.setOpcode(Mips::BEQ_MM); @@ -3581,7 +3637,6 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, assert(DstRegOp.isReg() && "expected register operand kind"); const MCOperand &BaseRegOp = Inst.getOperand(1); assert(BaseRegOp.isReg() && "expected register operand kind"); - const MCOperand &OffsetOp = Inst.getOperand(2); MipsTargetStreamer &TOut = getTargetStreamer(); unsigned DstReg = DstRegOp.getReg(); @@ -3603,6 +3658,26 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return; } + if (Inst.getNumOperands() > 3) { + const MCOperand &BaseRegOp = Inst.getOperand(2); + assert(BaseRegOp.isReg() && "expected register operand kind"); + const MCOperand &ExprOp = Inst.getOperand(3); + assert(ExprOp.isExpr() && "expected expression operand kind"); + + unsigned BaseReg = BaseRegOp.getReg(); + const MCExpr *ExprOffset = ExprOp.getExpr(); + + MCOperand LoOperand = MCOperand::createExpr( + MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext())); + MCOperand HiOperand = MCOperand::createExpr( + MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext())); + TOut.emitSCWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand, + LoOperand, TmpReg, IDLoc, STI); + return; + } + + const MCOperand &OffsetOp = Inst.getOperand(2); + if (OffsetOp.isImm()) { int64_t LoOffset = OffsetOp.getImm() & 0xffff; int64_t HiOffset = OffsetOp.getImm() & ~0xffff; @@ -3625,21 +3700,54 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, TOut.emitRRR(isGP64bit() ? Mips::DADDu : Mips::ADDu, TmpReg, TmpReg, BaseReg, IDLoc, STI); TOut.emitRRI(Inst.getOpcode(), DstReg, TmpReg, LoOffset, IDLoc, STI); - } else { - assert(OffsetOp.isExpr() && "expected expression operand kind"); - const MCExpr *ExprOffset = OffsetOp.getExpr(); - MCOperand LoOperand = MCOperand::createExpr( - MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext())); - MCOperand HiOperand = MCOperand::createExpr( - MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext())); + return; + } - if (IsLoad) - TOut.emitLoadWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand, - LoOperand, TmpReg, IDLoc, STI); - else - TOut.emitStoreWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand, - LoOperand, TmpReg, IDLoc, STI); + if (OffsetOp.isExpr()) { + if (inPicMode()) { + // FIXME: + // c) Check that immediates of R_MIPS_GOT16/R_MIPS_LO16 relocations + // do not exceed 16-bit. + // d) Use R_MIPS_GOT_PAGE/R_MIPS_GOT_OFST relocations instead + // of R_MIPS_GOT_DISP in appropriate cases to reduce number + // of GOT entries.
+      MCValue Res;
+      if (!OffsetOp.getExpr()->evaluateAsRelocatable(Res, nullptr, nullptr)) {
+        Error(IDLoc, "expected relocatable expression");
+        return;
+      }
+      if (Res.getSymB() != nullptr) {
+        Error(IDLoc, "expected relocatable expression with only one symbol");
+        return;
+      }
+
+      loadAndAddSymbolAddress(Res.getSymA(), TmpReg, BaseReg,
+                              !ABI.ArePtrs64bit(), IDLoc, Out, STI);
+      TOut.emitRRI(Inst.getOpcode(), DstReg, TmpReg, Res.getConstant(), IDLoc,
+                   STI);
+    } else {
+      // FIXME: Implement 64-bit case.
+      // 1) lw $8, sym => lui $8, %hi(sym)
+      //                  lw $8, %lo(sym)($8)
+      // 2) sw $8, sym => lui $at, %hi(sym)
+      //                  sw $8, %lo(sym)($at)
+      const MCExpr *ExprOffset = OffsetOp.getExpr();
+      MCOperand LoOperand = MCOperand::createExpr(
+          MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext()));
+      MCOperand HiOperand = MCOperand::createExpr(
+          MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext()));
+
+      // Generate the base address in TmpReg.
+      TOut.emitRX(Mips::LUi, TmpReg, HiOperand, IDLoc, STI);
+      if (BaseReg != Mips::ZERO)
+        TOut.emitRRR(Mips::ADDu, TmpReg, TmpReg, BaseReg, IDLoc, STI);
+      // Emit the load or store with the adjusted base and offset.
+      TOut.emitRRX(Inst.getOpcode(), DstReg, TmpReg, LoOperand, IDLoc, STI);
+    }
+    return;
   }
+
+  llvm_unreachable("unexpected operand type");
 }

 bool MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc,
@@ -6976,7 +7084,7 @@ bool MipsAsmParser::parseSetPushDirective() {

   // Create a copy of the current assembler options environment and push it.
   AssemblerOptions.push_back(
-      llvm::make_unique<MipsAssemblerOptions>(AssemblerOptions.back().get()));
+      std::make_unique<MipsAssemblerOptions>(AssemblerOptions.back().get()));
   getTargetStreamer().emitDirectiveSetPush();
   return false;
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index ef13507fe63a..c3e98fe410c1 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -267,6 +267,13 @@ static DecodeStatus DecodeJumpTargetMM(MCInst &Inst,
                                        uint64_t Address,
                                        const void *Decoder);

+// DecodeJumpTargetXMM - Decode microMIPS jump and link exchange target,
+// which is shifted left by 2 bits.
+static DecodeStatus DecodeJumpTargetXMM(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeMem(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -2291,6 +2298,15 @@ static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeJumpTargetXMM(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + unsigned JumpOffset = fieldFromInstruction(Insn, 0, 26) << 2; + Inst.addOperand(MCOperand::createImm(JumpOffset)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeAddiur2Simm7(MCInst &Inst, unsigned Value, uint64_t Address, diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 859f9cbbca07..70f2a7bdf10f 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -304,7 +304,6 @@ Optional<MCFixupKind> MipsAsmBackend::getFixupKind(StringRef Name) const { return StringSwitch<Optional<MCFixupKind>>(Name) .Case("R_MIPS_NONE", FK_NONE) .Case("R_MIPS_32", FK_Data_4) - .Case("R_MIPS_GOT_PAGE", (MCFixupKind)Mips::fixup_Mips_GOT_PAGE) .Case("R_MIPS_CALL_HI16", (MCFixupKind)Mips::fixup_Mips_CALL_HI16) .Case("R_MIPS_CALL_LO16", (MCFixupKind)Mips::fixup_Mips_CALL_LO16) .Case("R_MIPS_CALL16", (MCFixupKind)Mips::fixup_Mips_CALL16) diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index 4d7e36995ae4..cca75dfc45c2 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -66,9 +66,9 @@ public: /// fixupNeedsRelaxation - Target specific predicate for whether a given /// fixup requires the associated instruction to be relaxed. - bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const override { + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override { // FIXME. llvm_unreachable("RelaxInstruction() unimplemented"); return false; diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index cf7bae98a27f..cc3168790b98 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -219,7 +219,7 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx, const MCFixup &Fixup, bool IsPCRel) const { // Determine the type of the relocation. 
- unsigned Kind = (unsigned)Fixup.getKind(); + unsigned Kind = Fixup.getTargetKind(); switch (Kind) { case FK_NONE: @@ -690,6 +690,6 @@ llvm::createMipsELFObjectWriter(const Triple &TT, bool IsN32) { uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS()); bool IsN64 = TT.isArch64Bit() && !IsN32; bool HasRelocationAddend = TT.isArch64Bit(); - return llvm::make_unique<MipsELFObjectWriter>(OSABI, HasRelocationAddend, + return std::make_unique<MipsELFObjectWriter>(OSABI, HasRelocationAddend, IsN64); } diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index 759a7fdb32b8..142e9cebb79e 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -485,8 +485,11 @@ getJumpOffset16OpValue(const MCInst &MI, unsigned OpNo, assert(MO.isExpr() && "getJumpOffset16OpValue expects only expressions or an immediate"); - // TODO: Push fixup. - return 0; + const MCExpr *Expr = MO.getExpr(); + Mips::Fixups FixupKind = + isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16 : Mips::fixup_Mips_LO16; + Fixups.push_back(MCFixup::create(0, Expr, MCFixupKind(FixupKind))); + return 0; } /// getJumpTargetOpValue - Return binary encoding of the jump diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h index ad5aff6552f6..a84ca8ccfb2d 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h @@ -10,11 +10,12 @@ #define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCNACL_H #include "llvm/MC/MCELFStreamer.h" +#include "llvm/Support/Alignment.h" namespace llvm { -// Log2 of the NaCl MIPS sandbox's instruction bundle size. -static const unsigned MIPS_NACL_BUNDLE_ALIGN = 4u; +// NaCl MIPS sandbox's instruction bundle size. +static const Align MIPS_NACL_BUNDLE_ALIGN = Align(16); bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx, bool *IsStore = nullptr); diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp index ddeec03ba784..79c47d1b6508 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp @@ -143,12 +143,15 @@ public: return false; switch (Info->get(Inst.getOpcode()).OpInfo[NumOps - 1].OperandType) { case MCOI::OPERAND_UNKNOWN: - case MCOI::OPERAND_IMMEDIATE: - // jal, bal ... - Target = Inst.getOperand(NumOps - 1).getImm(); + case MCOI::OPERAND_IMMEDIATE: { + // j, jal, jalx, jals + // Absolute branch within the current 256 MB-aligned region + uint64_t Region = Addr & ~uint64_t(0xfffffff); + Target = Region + Inst.getOperand(NumOps - 1).getImm(); return true; + } case MCOI::OPERAND_PCREL: - // b, j, beq ... + // b, beq ... Target = Addr + Inst.getOperand(NumOps - 1).getImm(); return true; default: diff --git a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp index c050db8a17fd..2d53750ad0ee 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp @@ -270,7 +270,7 @@ MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context, S->getAssembler().setRelaxAll(true); // Set bundle-alignment as required by the NaCl ABI for the target. 
- S->EmitBundleAlignMode(MIPS_NACL_BUNDLE_ALIGN); + S->EmitBundleAlignMode(Log2(MIPS_NACL_BUNDLE_ALIGN)); return S; } diff --git a/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp index b4ebb9d18b72..3ff9c722484b 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp @@ -37,7 +37,7 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() { Context.getELFSection(".MIPS.options", ELF::SHT_MIPS_OPTIONS, ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP, 1, ""); MCA.registerSection(*Sec); - Sec->setAlignment(8); + Sec->setAlignment(Align(8)); Streamer->SwitchSection(Sec); Streamer->EmitIntValue(ELF::ODK_REGINFO, 1); // kind @@ -55,7 +55,7 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() { MCSectionELF *Sec = Context.getELFSection(".reginfo", ELF::SHT_MIPS_REGINFO, ELF::SHF_ALLOC, 24, ""); MCA.registerSection(*Sec); - Sec->setAlignment(MTS->getABI().IsN32() ? 8 : 4); + Sec->setAlignment(MTS->getABI().IsN32() ? Align(8) : Align(4)); Streamer->SwitchSection(Sec); Streamer->EmitIntValue(ri_gprmask, 4); diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index e3bdb3b140a8..b6dae9f6dea8 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -34,6 +34,15 @@ static cl::opt<bool> RoundSectionSizes( cl::desc("Round section sizes up to the section alignment"), cl::Hidden); } // end anonymous namespace +static bool isMipsR6(const MCSubtargetInfo *STI) { + return STI->getFeatureBits()[Mips::FeatureMips32r6] || + STI->getFeatureBits()[Mips::FeatureMips64r6]; +} + +static bool isMicroMips(const MCSubtargetInfo *STI) { + return STI->getFeatureBits()[Mips::FeatureMicroMips]; +} + MipsTargetStreamer::MipsTargetStreamer(MCStreamer &S) : MCTargetStreamer(S), GPReg(Mips::GP), ModuleDirectiveAllowed(true) { GPRInfoSet = FPRInfoSet = FrameInfoSet = false; @@ -216,6 +225,19 @@ void MipsTargetStreamer::emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, emitRRX(Opcode, Reg0, Reg1, MCOperand::createReg(Reg2), IDLoc, STI); } +void MipsTargetStreamer::emitRRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1, + unsigned Reg2, MCOperand Op3, SMLoc IDLoc, + const MCSubtargetInfo *STI) { + MCInst TmpInst; + TmpInst.setOpcode(Opcode); + TmpInst.addOperand(MCOperand::createReg(Reg0)); + TmpInst.addOperand(MCOperand::createReg(Reg1)); + TmpInst.addOperand(MCOperand::createReg(Reg2)); + TmpInst.addOperand(Op3); + TmpInst.setLoc(IDLoc); + getStreamer().EmitInstruction(TmpInst, *STI); +} + void MipsTargetStreamer::emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm, SMLoc IDLoc, const MCSubtargetInfo *STI) { @@ -264,8 +286,7 @@ void MipsTargetStreamer::emitEmptyDelaySlot(bool hasShortDelaySlot, SMLoc IDLoc, } void MipsTargetStreamer::emitNop(SMLoc IDLoc, const MCSubtargetInfo *STI) { - const FeatureBitset &Features = STI->getFeatureBits(); - if (Features[Mips::FeatureMicroMips]) + if (isMicroMips(STI)) emitRR(Mips::MOVE16_MM, Mips::ZERO, Mips::ZERO, IDLoc, STI); else emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI); @@ -311,21 +332,34 @@ void MipsTargetStreamer::emitStoreWithImmOffset( emitRRI(Opcode, SrcReg, ATReg, LoOffset, IDLoc, STI); } -/// Emit a store instruction with an symbol offset. Symbols are assumed to be -/// out of range for a simm16 will be expanded to appropriate instructions. 
-void MipsTargetStreamer::emitStoreWithSymOffset(
-    unsigned Opcode, unsigned SrcReg, unsigned BaseReg, MCOperand &HiOperand,
-    MCOperand &LoOperand, unsigned ATReg, SMLoc IDLoc,
-    const MCSubtargetInfo *STI) {
-  // sw $8, sym => lui $at, %hi(sym)
-  //               sw $8, %lo(sym)($at)
+/// Emit a store instruction with a symbol offset.
+void MipsTargetStreamer::emitSCWithSymOffset(unsigned Opcode, unsigned SrcReg,
+                                             unsigned BaseReg,
+                                             MCOperand &HiOperand,
+                                             MCOperand &LoOperand,
+                                             unsigned ATReg, SMLoc IDLoc,
+                                             const MCSubtargetInfo *STI) {
+  // sc $8, sym => lui $at, %hi(sym)
+  //               sc $8, %lo(sym)($at)

   // Generate the base address in ATReg.
   emitRX(Mips::LUi, ATReg, HiOperand, IDLoc, STI);
-  if (BaseReg != Mips::ZERO)
-    emitRRR(Mips::ADDu, ATReg, ATReg, BaseReg, IDLoc, STI);
-  // Emit the store with the adjusted base and offset.
-  emitRRX(Opcode, SrcReg, ATReg, LoOperand, IDLoc, STI);
+  if (!isMicroMips(STI) && isMipsR6(STI)) {
+    // For non-micromips r6, the offset for 'sc' is not in the lower 16 bits,
+    // so we put it in 'at'.
+    // sc $8, sym => lui $at, %hi(sym)
+    //               addiu $at, $at, %lo(sym)
+    //               sc $8, 0($at)
+    emitRRX(Mips::ADDiu, ATReg, ATReg, LoOperand, IDLoc, STI);
+    MCOperand Offset = MCOperand::createImm(0);
+    // Emit the store with the adjusted base and offset.
+    emitRRRX(Opcode, SrcReg, SrcReg, ATReg, Offset, IDLoc, STI);
+  } else {
+    if (BaseReg != Mips::ZERO)
+      emitRRR(Mips::ADDu, ATReg, ATReg, BaseReg, IDLoc, STI);
+    // Emit the store with the adjusted base and offset.
+    emitRRRX(Opcode, SrcReg, SrcReg, ATReg, LoOperand, IDLoc, STI);
+  }
 }

 /// Emit a load instruction with an immediate offset. DstReg and TmpReg are
@@ -364,30 +398,6 @@ void MipsTargetStreamer::emitLoadWithImmOffset(unsigned Opcode, unsigned DstReg,
   emitRRI(Opcode, DstReg, TmpReg, LoOffset, IDLoc, STI);
 }

-/// Emit a load instruction with an symbol offset. Symbols are assumed to be
-/// out of range for a simm16 will be expanded to appropriate instructions.
-/// DstReg and TmpReg are permitted to be the same register iff DstReg is a
-/// GPR. It is the callers responsibility to identify such cases and pass the
-/// appropriate register in TmpReg.
-void MipsTargetStreamer::emitLoadWithSymOffset(unsigned Opcode, unsigned DstReg,
-                                               unsigned BaseReg,
-                                               MCOperand &HiOperand,
-                                               MCOperand &LoOperand,
-                                               unsigned TmpReg, SMLoc IDLoc,
-                                               const MCSubtargetInfo *STI) {
-  // 1) lw $8, sym => lui $8, %hi(sym)
-  //                  lw $8, %lo(sym)($8)
-  // 2) ldc1 $f0, sym => lui $at, %hi(sym)
-  //                     ldc1 $f0, %lo(sym)($at)
-
-  // Generate the base address in TmpReg.
-  emitRX(Mips::LUi, TmpReg, HiOperand, IDLoc, STI);
-  if (BaseReg != Mips::ZERO)
-    emitRRR(Mips::ADDu, TmpReg, TmpReg, BaseReg, IDLoc, STI);
-  // Emit the load with the adjusted base and offset.
- emitRRX(Opcode, DstReg, TmpReg, LoOperand, IDLoc, STI); -} - MipsTargetAsmStreamer::MipsTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS) : MipsTargetStreamer(S), OS(OS) {} @@ -891,9 +901,9 @@ void MipsTargetELFStreamer::finish() { MCSection &BSSSection = *OFI.getBSSSection(); MCA.registerSection(BSSSection); - TextSection.setAlignment(std::max(16u, TextSection.getAlignment())); - DataSection.setAlignment(std::max(16u, DataSection.getAlignment())); - BSSSection.setAlignment(std::max(16u, BSSSection.getAlignment())); + TextSection.setAlignment(Align(std::max(16u, TextSection.getAlignment()))); + DataSection.setAlignment(Align(std::max(16u, DataSection.getAlignment()))); + BSSSection.setAlignment(Align(std::max(16u, BSSSection.getAlignment()))); if (RoundSectionSizes) { // Make sections sizes a multiple of the alignment. This is useful for @@ -1016,7 +1026,7 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) { MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Context); MCA.registerSection(*Sec); - Sec->setAlignment(4); + Sec->setAlignment(Align(4)); OS.PushSection(); @@ -1306,7 +1316,7 @@ void MipsTargetELFStreamer::emitMipsAbiFlags() { MCSectionELF *Sec = Context.getELFSection( ".MIPS.abiflags", ELF::SHT_MIPS_ABIFLAGS, ELF::SHF_ALLOC, 24, ""); MCA.registerSection(*Sec); - Sec->setAlignment(8); + Sec->setAlignment(Align(8)); OS.SwitchSection(Sec); OS << ABIFlagsSection; diff --git a/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/lib/Target/Mips/MicroMipsDSPInstrInfo.td index 5a12568893af..9a1e47e5ecca 100644 --- a/lib/Target/Mips/MicroMipsDSPInstrInfo.td +++ b/lib/Target/Mips/MicroMipsDSPInstrInfo.td @@ -360,7 +360,7 @@ class RDDSP_MM_DESC { dag OutOperandList = (outs GPR32Opnd:$rt); dag InOperandList = (ins uimm7:$mask); string AsmString = !strconcat("rddsp", "\t$rt, $mask"); - list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_rddsp immZExt7:$mask))]; + list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_rddsp timmZExt7:$mask))]; InstrItinClass Itinerary = NoItinerary; } @@ -383,7 +383,7 @@ class WRDSP_MM_DESC { dag OutOperandList = (outs); dag InOperandList = (ins GPR32Opnd:$rt, uimm7:$mask); string AsmString = !strconcat("wrdsp", "\t$rt, $mask"); - list<dag> Pattern = [(int_mips_wrdsp GPR32Opnd:$rt, immZExt7:$mask)]; + list<dag> Pattern = [(int_mips_wrdsp GPR32Opnd:$rt, timmZExt7:$mask)]; InstrItinClass Itinerary = NoItinerary; bit isMoveReg = 1; } diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td index 9b7f7b25fa94..8cc0029fc896 100644 --- a/lib/Target/Mips/MicroMipsInstrInfo.td +++ b/lib/Target/Mips/MicroMipsInstrInfo.td @@ -955,17 +955,18 @@ let DecoderNamespace = "MicroMips" in { EXT_FM_MM<0x0c>, ISA_MICROMIPS32_NOT_MIPS32R6; /// Jump Instructions - let DecoderMethod = "DecodeJumpTargetMM" in + let DecoderMethod = "DecodeJumpTargetMM" in { def J_MM : MMRel, JumpFJ<jmptarget_mm, "j", br, bb, "j">, J_FM_MM<0x35>, AdditionalRequires<[RelocNotPIC]>, IsBranch, ISA_MICROMIPS32_NOT_MIPS32R6; - - let DecoderMethod = "DecodeJumpTargetMM" in { def JAL_MM : MMRel, JumpLink<"jal", calltarget_mm>, J_FM_MM<0x3d>, ISA_MICROMIPS32_NOT_MIPS32R6; + } + + let DecoderMethod = "DecodeJumpTargetXMM" in def JALX_MM : MMRel, JumpLink<"jalx", calltarget>, J_FM_MM<0x3c>, ISA_MICROMIPS32_NOT_MIPS32R6; - } + def JR_MM : MMRel, IndirectBranch<"jr", GPR32Opnd>, JR_FM_MM<0x3c>, ISA_MICROMIPS32_NOT_MIPS32R6; def JALR_MM : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>, diff --git a/lib/Target/Mips/MicroMipsSizeReduction.cpp 
b/lib/Target/Mips/MicroMipsSizeReduction.cpp index 70af95592aa5..db93b3d80ede 100644 --- a/lib/Target/Mips/MicroMipsSizeReduction.cpp +++ b/lib/Target/Mips/MicroMipsSizeReduction.cpp @@ -361,7 +361,7 @@ static bool CheckXWPInstr(MachineInstr *MI, bool ReduceToLwp, MI->getOpcode() == Mips::SW16_MM)) return false; - unsigned reg = MI->getOperand(0).getReg(); + Register reg = MI->getOperand(0).getReg(); if (reg == Mips::RA) return false; @@ -403,8 +403,8 @@ static bool ConsecutiveInstr(MachineInstr *MI1, MachineInstr *MI2) { if (!GetImm(MI2, 2, Offset2)) return false; - unsigned Reg1 = MI1->getOperand(0).getReg(); - unsigned Reg2 = MI2->getOperand(0).getReg(); + Register Reg1 = MI1->getOperand(0).getReg(); + Register Reg2 = MI2->getOperand(0).getReg(); return ((Offset1 == (Offset2 - 4)) && (ConsecutiveRegisters(Reg1, Reg2))); } @@ -475,8 +475,8 @@ bool MicroMipsSizeReduce::ReduceXWtoXWP(ReduceEntryFunArgs *Arguments) { if (!CheckXWPInstr(MI2, ReduceToLwp, Entry)) return false; - unsigned Reg1 = MI1->getOperand(1).getReg(); - unsigned Reg2 = MI2->getOperand(1).getReg(); + Register Reg1 = MI1->getOperand(1).getReg(); + Register Reg2 = MI2->getOperand(1).getReg(); if (Reg1 != Reg2) return false; @@ -621,8 +621,8 @@ bool MicroMipsSizeReduce::ReduceMoveToMovep(ReduceEntryFunArgs *Arguments) { MachineInstr *MI1 = Arguments->MI; MachineInstr *MI2 = &*NextMII; - unsigned RegDstMI1 = MI1->getOperand(0).getReg(); - unsigned RegSrcMI1 = MI1->getOperand(1).getReg(); + Register RegDstMI1 = MI1->getOperand(0).getReg(); + Register RegSrcMI1 = MI1->getOperand(1).getReg(); if (!IsMovepSrcRegister(RegSrcMI1)) return false; @@ -633,8 +633,8 @@ bool MicroMipsSizeReduce::ReduceMoveToMovep(ReduceEntryFunArgs *Arguments) { if (MI2->getOpcode() != Entry.WideOpc()) return false; - unsigned RegDstMI2 = MI2->getOperand(0).getReg(); - unsigned RegSrcMI2 = MI2->getOperand(1).getReg(); + Register RegDstMI2 = MI2->getOperand(0).getReg(); + Register RegSrcMI2 = MI2->getOperand(1).getReg(); if (!IsMovepSrcRegister(RegSrcMI2)) return false; diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index 7b83ea8535ae..a5908362e81f 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -25,6 +25,8 @@ class PredicateControl { list<Predicate> GPRPredicates = []; // Predicates for the PTR size such as IsPTR64bit list<Predicate> PTRPredicates = []; + // Predicates for a symbol's size such as hasSym32. + list<Predicate> SYMPredicates = []; // Predicates for the FGR size and layout such as IsFP64bit list<Predicate> FGRPredicates = []; // Predicates for the instruction group membership such as ISA's. 
@@ -38,6 +40,7 @@ class PredicateControl { list<Predicate> Predicates = !listconcat(EncodingPredicates, GPRPredicates, PTRPredicates, + SYMPredicates, FGRPredicates, InsnPredicates, HardFloatPredicate, @@ -206,6 +209,9 @@ def FeatureMT : SubtargetFeature<"mt", "HasMT", "true", "Mips MT ASE">; def FeatureLongCalls : SubtargetFeature<"long-calls", "UseLongCalls", "true", "Disable use of the jal instruction">; +def FeatureXGOT + : SubtargetFeature<"xgot", "UseXGOT", "true", "Assume 32-bit GOT">; + def FeatureUseIndirectJumpsHazard : SubtargetFeature<"use-indirect-jump-hazard", "UseIndirectJumpsHazard", "true", "Use indirect jump" @@ -257,3 +263,9 @@ def Mips : Target { let AssemblyParserVariants = [MipsAsmParserVariant]; let AllowRegisterRenaming = 1; } + +//===----------------------------------------------------------------------===// +// Pfm Counters +//===----------------------------------------------------------------------===// + +include "MipsPfmCounters.td" diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp index 3ab4f1e064da..768d54fc9c24 100644 --- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp +++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp @@ -72,7 +72,7 @@ void Mips16DAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) { MachineRegisterInfo &RegInfo = MF.getRegInfo(); const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); DebugLoc DL; - unsigned V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg(); + Register V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg(); const TargetRegisterClass *RC = &Mips::CPU16RegsRegClass; V0 = RegInfo.createVirtualRegister(RC); diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp index 6d8e5aef2a3f..5a5b78c9d5f9 100644 --- a/lib/Target/Mips/Mips16ISelLowering.cpp +++ b/lib/Target/Mips/Mips16ISelLowering.cpp @@ -708,8 +708,8 @@ Mips16TargetLowering::emitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc, if (DontExpandCondPseudos16) return BB; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - unsigned regX = MI.getOperand(0).getReg(); - unsigned regY = MI.getOperand(1).getReg(); + Register regX = MI.getOperand(0).getReg(); + Register regY = MI.getOperand(1).getReg(); MachineBasicBlock *target = MI.getOperand(2).getMBB(); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(CmpOpc)) .addReg(regX) @@ -725,7 +725,7 @@ MachineBasicBlock *Mips16TargetLowering::emitFEXT_T8I8I16_ins( if (DontExpandCondPseudos16) return BB; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - unsigned regX = MI.getOperand(0).getReg(); + Register regX = MI.getOperand(0).getReg(); int64_t imm = MI.getOperand(1).getImm(); MachineBasicBlock *target = MI.getOperand(2).getMBB(); unsigned CmpOpc; @@ -758,9 +758,9 @@ Mips16TargetLowering::emitFEXT_CCRX16_ins(unsigned SltOpc, MachineInstr &MI, if (DontExpandCondPseudos16) return BB; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - unsigned CC = MI.getOperand(0).getReg(); - unsigned regX = MI.getOperand(1).getReg(); - unsigned regY = MI.getOperand(2).getReg(); + Register CC = MI.getOperand(0).getReg(); + Register regX = MI.getOperand(1).getReg(); + Register regY = MI.getOperand(2).getReg(); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SltOpc)) .addReg(regX) .addReg(regY); @@ -777,8 +777,8 @@ Mips16TargetLowering::emitFEXT_CCRXI16_ins(unsigned SltiOpc, unsigned SltiXOpc, if (DontExpandCondPseudos16) return BB; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - unsigned CC = MI.getOperand(0).getReg(); - unsigned regX = MI.getOperand(1).getReg(); + 
Register CC = MI.getOperand(0).getReg(); + Register regX = MI.getOperand(1).getReg(); int64_t Imm = MI.getOperand(2).getImm(); unsigned SltOpc = Mips16WhichOp8uOr16simm(SltiOpc, SltiXOpc, Imm); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SltOpc)).addReg(regX).addImm(Imm); diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp index c234c309d760..0d735c20ec2f 100644 --- a/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/lib/Target/Mips/Mips16InstrInfo.cpp @@ -358,7 +358,7 @@ unsigned Mips16InstrInfo::loadImmediate(unsigned FrameReg, int64_t Imm, for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) { MachineOperand &MO = II->getOperand(i); if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() && - !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + !Register::isVirtualRegister(MO.getReg())) Candidates.reset(MO.getReg()); } diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index 7f35280f7936..cc15949b0d57 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -16,6 +16,7 @@ // shamt must fit in 6 bits. def immZExt6 : ImmLeaf<i32, [{return Imm == (Imm & 0x3f);}]>; +def timmZExt6 : TImmLeaf<i32, [{return Imm == (Imm & 0x3f);}]>; // Node immediate fits as 10-bit sign extended on target immediate. // e.g. seqi, snei @@ -651,6 +652,7 @@ def : MipsPat<(MipsTlsHi tglobaltlsaddr:$in), (LUi64 tglobaltlsaddr:$in)>, let AdditionalPredicates = [NotInMicroMips] in { def : MipsPat<(MipsJmpLink (i64 texternalsym:$dst)), (JAL texternalsym:$dst)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsHighest (i64 tglobaladdr:$in)), (LUi64 tglobaladdr:$in)>, ISA_MIPS3, GPR_64, SYM_64; def : MipsPat<(MipsHighest (i64 tblockaddress:$in)), @@ -682,6 +684,20 @@ let AdditionalPredicates = [NotInMicroMips] in { (DADDiu GPR64:$hi, tjumptable:$lo)>, ISA_MIPS3, GPR_64, SYM_64; def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tconstpool:$lo))), (DADDiu GPR64:$hi, tconstpool:$lo)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 texternalsym:$lo))), + (DADDiu GPR64:$hi, texternalsym:$lo)>, + ISA_MIPS3, GPR_64, SYM_64; + + def : MipsPat<(MipsHi (i64 tglobaladdr:$in)), + (DADDiu ZERO_64, tglobaladdr:$in)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsHi (i64 tblockaddress:$in)), + (DADDiu ZERO_64, tblockaddress:$in)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsHi (i64 tjumptable:$in)), + (DADDiu ZERO_64, tjumptable:$in)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsHi (i64 tconstpool:$in)), + (DADDiu ZERO_64, tconstpool:$in)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsHi (i64 texternalsym:$in)), + (DADDiu ZERO_64, texternalsym:$in)>, ISA_MIPS3, GPR_64, SYM_64; def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tglobaladdr:$lo))), (DADDiu GPR64:$hi, tglobaladdr:$lo)>, ISA_MIPS3, GPR_64, SYM_64; @@ -692,6 +708,23 @@ let AdditionalPredicates = [NotInMicroMips] in { (DADDiu GPR64:$hi, tjumptable:$lo)>, ISA_MIPS3, GPR_64, SYM_64; def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tconstpool:$lo))), (DADDiu GPR64:$hi, tconstpool:$lo)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(add GPR64:$hi, (MipsHi (i64 texternalsym:$lo))), + (DADDiu GPR64:$hi, texternalsym:$lo)>, + ISA_MIPS3, GPR_64, SYM_64; + + def : MipsPat<(MipsLo (i64 tglobaladdr:$in)), + (DADDiu ZERO_64, tglobaladdr:$in)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsLo (i64 tblockaddress:$in)), + (DADDiu ZERO_64, tblockaddress:$in)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsLo (i64 tjumptable:$in)), + (DADDiu ZERO_64, tjumptable:$in)>, 
ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsLo (i64 tconstpool:$in)), + (DADDiu ZERO_64, tconstpool:$in)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsLo (i64 tglobaltlsaddr:$in)), + (DADDiu ZERO_64, tglobaltlsaddr:$in)>, + ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsLo (i64 texternalsym:$in)), + (DADDiu ZERO_64, texternalsym:$in)>, ISA_MIPS3, GPR_64, SYM_64; def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tglobaladdr:$lo))), (DADDiu GPR64:$hi, tglobaladdr:$lo)>, ISA_MIPS3, GPR_64, SYM_64; @@ -705,6 +738,9 @@ let AdditionalPredicates = [NotInMicroMips] in { def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tglobaltlsaddr:$lo))), (DADDiu GPR64:$hi, tglobaltlsaddr:$lo)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(add GPR64:$hi, (MipsLo (i64 texternalsym:$lo))), + (DADDiu GPR64:$hi, texternalsym:$lo)>, + ISA_MIPS3, GPR_64, SYM_64; } // gp_rel relocs diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index db83fe49cec0..2201545adc94 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -56,6 +56,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include <cassert> #include <cstdint> @@ -376,7 +377,7 @@ void MipsAsmPrinter::printSavedRegsBitmask() { void MipsAsmPrinter::emitFrameDirective() { const TargetRegisterInfo &RI = *MF->getSubtarget().getRegisterInfo(); - unsigned stackReg = RI.getFrameRegister(*MF); + Register stackReg = RI.getFrameRegister(*MF); unsigned returnReg = RI.getRARegister(); unsigned stackSize = MF->getFrameInfo().getStackSize(); @@ -571,7 +572,7 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, // for 2 for 32 bit mode and 1 for 64 bit mode. if (NumVals != 2) { if (Subtarget->isGP64bit() && NumVals == 1 && MO.isReg()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); O << '$' << MipsInstPrinter::getRegisterName(Reg); return false; } @@ -597,7 +598,7 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, const MachineOperand &MO = MI->getOperand(RegOp); if (!MO.isReg()) return true; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); O << '$' << MipsInstPrinter::getRegisterName(Reg); return false; } @@ -780,7 +781,7 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) { StringRef CPU = MIPS_MC::selectMipsCPU(TT, TM.getTargetCPU()); StringRef FS = TM.getTargetFeatureString(); const MipsTargetMachine &MTM = static_cast<const MipsTargetMachine &>(TM); - const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM, 0); + const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM, None); bool IsABICalls = STI.isABICalls(); const MipsABIInfo &ABI = MTM.getABI(); @@ -821,6 +822,9 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) { // option has changed the default (i.e. FPXX) and omit it otherwise. if (ABI.IsO32() && (!STI.useOddSPReg() || STI.isABI_FPXX())) TS.emitDirectiveModuleOddSPReg(); + + // Switch to the .text section. 
+ OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); } void MipsAsmPrinter::emitInlineAsmStart() const { diff --git a/lib/Target/Mips/MipsCallLowering.cpp b/lib/Target/Mips/MipsCallLowering.cpp index da65689ecff5..cad82953af50 100644 --- a/lib/Target/Mips/MipsCallLowering.cpp +++ b/lib/Target/Mips/MipsCallLowering.cpp @@ -106,6 +106,7 @@ private: Register ArgsReg, const EVT &VT) override; virtual void markPhysRegUsed(unsigned PhysReg) { + MIRBuilder.getMRI()->addLiveIn(PhysReg); MIRBuilder.getMBB().addLiveIn(PhysReg); } @@ -357,7 +358,7 @@ bool OutgoingValueHandler::handleSplit(SmallVectorImpl<Register> &VRegs, return true; } -static bool isSupportedType(Type *T) { +static bool isSupportedArgumentType(Type *T) { if (T->isIntegerTy()) return true; if (T->isPointerTy()) @@ -367,6 +368,18 @@ static bool isSupportedType(Type *T) { return false; } +static bool isSupportedReturnType(Type *T) { + if (T->isIntegerTy()) + return true; + if (T->isPointerTy()) + return true; + if (T->isFloatingPointTy()) + return true; + if (T->isAggregateType()) + return true; + return false; +} + static CCValAssign::LocInfo determineLocInfo(const MVT RegisterVT, const EVT VT, const ISD::ArgFlagsTy &Flags) { // > does not mean loss of information as type RegisterVT can't hold type VT, @@ -403,7 +416,7 @@ bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, MachineInstrBuilder Ret = MIRBuilder.buildInstrNoInsert(Mips::RetRA); - if (Val != nullptr && !isSupportedType(Val->getType())) + if (Val != nullptr && !isSupportedReturnType(Val->getType())) return false; if (!VRegs.empty()) { @@ -411,21 +424,13 @@ bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, const Function &F = MF.getFunction(); const DataLayout &DL = MF.getDataLayout(); const MipsTargetLowering &TLI = *getTLI<MipsTargetLowering>(); - LLVMContext &Ctx = Val->getType()->getContext(); - - SmallVector<EVT, 4> SplitEVTs; - ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs); - assert(VRegs.size() == SplitEVTs.size() && - "For each split Type there should be exactly one VReg."); SmallVector<ArgInfo, 8> RetInfos; SmallVector<unsigned, 8> OrigArgIndices; - for (unsigned i = 0; i < SplitEVTs.size(); ++i) { - ArgInfo CurArgInfo = ArgInfo{VRegs[i], SplitEVTs[i].getTypeForEVT(Ctx)}; - setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F); - splitToValueTypes(CurArgInfo, 0, RetInfos, OrigArgIndices); - } + ArgInfo ArgRetInfo(VRegs, Val->getType()); + setArgFlags(ArgRetInfo, AttributeList::ReturnIndex, DL, F); + splitToValueTypes(DL, ArgRetInfo, 0, RetInfos, OrigArgIndices); SmallVector<ISD::OutputArg, 8> Outs; subTargetRegTypeForCallingConv(F, RetInfos, OrigArgIndices, Outs); @@ -453,12 +458,8 @@ bool MipsCallLowering::lowerFormalArguments( if (F.arg_empty()) return true; - if (F.isVarArg()) { - return false; - } - for (auto &Arg : F.args()) { - if (!isSupportedType(Arg.getType())) + if (!isSupportedArgumentType(Arg.getType())) return false; } @@ -472,7 +473,8 @@ bool MipsCallLowering::lowerFormalArguments( for (auto &Arg : F.args()) { ArgInfo AInfo(VRegs[i], Arg.getType()); setArgFlags(AInfo, i + AttributeList::FirstArgIndex, DL, F); - splitToValueTypes(AInfo, i, ArgInfos, OrigArgIndices); + ArgInfos.push_back(AInfo); + OrigArgIndices.push_back(i); ++i; } @@ -495,30 +497,64 @@ bool MipsCallLowering::lowerFormalArguments( if (!Handler.handle(ArgLocs, ArgInfos)) return false; + if (F.isVarArg()) { + ArrayRef<MCPhysReg> ArgRegs = ABI.GetVarArgRegs(); + unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs); + + int VaArgOffset; 
+ unsigned RegSize = 4; + if (ArgRegs.size() == Idx) + VaArgOffset = alignTo(CCInfo.getNextStackOffset(), RegSize); + else { + VaArgOffset = + (int)ABI.GetCalleeAllocdArgSizeInBytes(CCInfo.getCallingConv()) - + (int)(RegSize * (ArgRegs.size() - Idx)); + } + + MachineFrameInfo &MFI = MF.getFrameInfo(); + int FI = MFI.CreateFixedObject(RegSize, VaArgOffset, true); + MF.getInfo<MipsFunctionInfo>()->setVarArgsFrameIndex(FI); + + for (unsigned I = Idx; I < ArgRegs.size(); ++I, VaArgOffset += RegSize) { + MIRBuilder.getMBB().addLiveIn(ArgRegs[I]); + + MachineInstrBuilder Copy = + MIRBuilder.buildCopy(LLT::scalar(RegSize * 8), Register(ArgRegs[I])); + FI = MFI.CreateFixedObject(RegSize, VaArgOffset, true); + MachinePointerInfo MPO = MachinePointerInfo::getFixedStack(MF, FI); + MachineInstrBuilder FrameIndex = + MIRBuilder.buildFrameIndex(LLT::pointer(MPO.getAddrSpace(), 32), FI); + MachineMemOperand *MMO = + MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, RegSize, + /* Alignment */ RegSize); + MIRBuilder.buildStore(Copy, FrameIndex, *MMO); + } + } + return true; } bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, - CallingConv::ID CallConv, - const MachineOperand &Callee, - const ArgInfo &OrigRet, - ArrayRef<ArgInfo> OrigArgs) const { + CallLoweringInfo &Info) const { - if (CallConv != CallingConv::C) + if (Info.CallConv != CallingConv::C) return false; - for (auto &Arg : OrigArgs) { - if (!isSupportedType(Arg.Ty)) + for (auto &Arg : Info.OrigArgs) { + if (!isSupportedArgumentType(Arg.Ty)) return false; - if (Arg.Flags.isByVal() || Arg.Flags.isSRet()) + if (Arg.Flags[0].isByVal()) + return false; + if (Arg.Flags[0].isSRet() && !Arg.Ty->isPointerTy()) return false; } - if (OrigRet.Regs[0] && !isSupportedType(OrigRet.Ty)) + if (!Info.OrigRet.Ty->isVoidTy() && !isSupportedReturnType(Info.OrigRet.Ty)) return false; MachineFunction &MF = MIRBuilder.getMF(); const Function &F = MF.getFunction(); + const DataLayout &DL = MF.getDataLayout(); const MipsTargetLowering &TLI = *getTLI<MipsTargetLowering>(); const MipsTargetMachine &TM = static_cast<const MipsTargetMachine &>(MF.getTarget()); @@ -528,37 +564,38 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, MIRBuilder.buildInstr(Mips::ADJCALLSTACKDOWN); const bool IsCalleeGlobalPIC = - Callee.isGlobal() && TM.isPositionIndependent(); + Info.Callee.isGlobal() && TM.isPositionIndependent(); MachineInstrBuilder MIB = MIRBuilder.buildInstrNoInsert( - Callee.isReg() || IsCalleeGlobalPIC ? Mips::JALRPseudo : Mips::JAL); + Info.Callee.isReg() || IsCalleeGlobalPIC ? 
Mips::JALRPseudo : Mips::JAL); MIB.addDef(Mips::SP, RegState::Implicit); if (IsCalleeGlobalPIC) { Register CalleeReg = MF.getRegInfo().createGenericVirtualRegister(LLT::pointer(0, 32)); MachineInstr *CalleeGlobalValue = - MIRBuilder.buildGlobalValue(CalleeReg, Callee.getGlobal()); - if (!Callee.getGlobal()->hasLocalLinkage()) + MIRBuilder.buildGlobalValue(CalleeReg, Info.Callee.getGlobal()); + if (!Info.Callee.getGlobal()->hasLocalLinkage()) CalleeGlobalValue->getOperand(1).setTargetFlags(MipsII::MO_GOT_CALL); MIB.addUse(CalleeReg); } else - MIB.add(Callee); + MIB.add(Info.Callee); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); MIB.addRegMask(TRI->getCallPreservedMask(MF, F.getCallingConv())); TargetLowering::ArgListTy FuncOrigArgs; - FuncOrigArgs.reserve(OrigArgs.size()); + FuncOrigArgs.reserve(Info.OrigArgs.size()); SmallVector<ArgInfo, 8> ArgInfos; SmallVector<unsigned, 8> OrigArgIndices; unsigned i = 0; - for (auto &Arg : OrigArgs) { + for (auto &Arg : Info.OrigArgs) { TargetLowering::ArgListEntry Entry; Entry.Ty = Arg.Ty; FuncOrigArgs.push_back(Entry); - splitToValueTypes(Arg, i, ArgInfos, OrigArgIndices); + ArgInfos.push_back(Arg); + OrigArgIndices.push_back(i); ++i; } @@ -566,11 +603,17 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, subTargetRegTypeForCallingConv(F, ArgInfos, OrigArgIndices, Outs); SmallVector<CCValAssign, 8> ArgLocs; - MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, + bool IsCalleeVarArg = false; + if (Info.Callee.isGlobal()) { + const Function *CF = static_cast<const Function *>(Info.Callee.getGlobal()); + IsCalleeVarArg = CF->isVarArg(); + } + MipsCCState CCInfo(F.getCallingConv(), IsCalleeVarArg, MF, ArgLocs, F.getContext()); - CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1); - const char *Call = Callee.isSymbol() ? Callee.getSymbolName() : nullptr; + CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(Info.CallConv), 1); + const char *Call = + Info.Callee.isSymbol() ? 
Info.Callee.getSymbolName() : nullptr; CCInfo.AnalyzeCallOperands(Outs, TLI.CCAssignFnForCall(), FuncOrigArgs, Call); setLocInfo(ArgLocs, Outs); @@ -599,11 +642,11 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, *STI.getRegBankInfo()); } - if (OrigRet.Regs[0]) { + if (!Info.OrigRet.Ty->isVoidTy()) { ArgInfos.clear(); SmallVector<unsigned, 8> OrigRetIndices; - splitToValueTypes(OrigRet, 0, ArgInfos, OrigRetIndices); + splitToValueTypes(DL, Info.OrigRet, 0, ArgInfos, OrigRetIndices); SmallVector<ISD::InputArg, 8> Ins; subTargetRegTypeForCallingConv(F, ArgInfos, OrigRetIndices, Ins); @@ -612,7 +655,7 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); - CCInfo.AnalyzeCallResult(Ins, TLI.CCAssignFnForReturn(), OrigRet.Ty, Call); + CCInfo.AnalyzeCallResult(Ins, TLI.CCAssignFnForReturn(), Info.OrigRet.Ty, Call); setLocInfo(ArgLocs, Ins); CallReturnHandler Handler(MIRBuilder, MF.getRegInfo(), MIB); @@ -642,12 +685,12 @@ void MipsCallLowering::subTargetRegTypeForCallingConv( F.getContext(), F.getCallingConv(), VT); for (unsigned i = 0; i < NumRegs; ++i) { - ISD::ArgFlagsTy Flags = Arg.Flags; + ISD::ArgFlagsTy Flags = Arg.Flags[0]; if (i == 0) Flags.setOrigAlign(TLI.getABIAlignmentForCallingConv(Arg.Ty, DL)); else - Flags.setOrigAlign(1); + Flags.setOrigAlign(Align::None()); ISDArgs.emplace_back(Flags, RegisterVT, VT, true, OrigArgIndices[ArgNo], 0); @@ -657,12 +700,21 @@ void MipsCallLowering::subTargetRegTypeForCallingConv( } void MipsCallLowering::splitToValueTypes( - const ArgInfo &OrigArg, unsigned OriginalIndex, + const DataLayout &DL, const ArgInfo &OrigArg, unsigned OriginalIndex, SmallVectorImpl<ArgInfo> &SplitArgs, SmallVectorImpl<unsigned> &SplitArgsOrigIndices) const { - // TODO : perform structure and array split. For now we only deal with - // types that pass isSupportedType check. - SplitArgs.push_back(OrigArg); - SplitArgsOrigIndices.push_back(OriginalIndex); + SmallVector<EVT, 4> SplitEVTs; + SmallVector<Register, 4> SplitVRegs; + const MipsTargetLowering &TLI = *getTLI<MipsTargetLowering>(); + LLVMContext &Ctx = OrigArg.Ty->getContext(); + + ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitEVTs); + + for (unsigned i = 0; i < SplitEVTs.size(); ++i) { + ArgInfo Info = ArgInfo{OrigArg.Regs[i], SplitEVTs[i].getTypeForEVT(Ctx)}; + Info.Flags = OrigArg.Flags; + SplitArgs.push_back(Info); + SplitArgsOrigIndices.push_back(OriginalIndex); + } } diff --git a/lib/Target/Mips/MipsCallLowering.h b/lib/Target/Mips/MipsCallLowering.h index 11c2d53ad35d..a284cf5e26cf 100644 --- a/lib/Target/Mips/MipsCallLowering.h +++ b/lib/Target/Mips/MipsCallLowering.h @@ -68,9 +68,8 @@ public: bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<ArrayRef<Register>> VRegs) const override; - bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, - const MachineOperand &Callee, const ArgInfo &OrigRet, - ArrayRef<ArgInfo> OrigArgs) const override; + bool lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const override; private: /// Based on registers available on target machine split or extend @@ -83,7 +82,8 @@ private: /// Split structures and arrays, save original argument indices since /// Mips calling convention needs info about original argument type. 
- void splitToValueTypes(const ArgInfo &OrigArg, unsigned OriginalIndex, + void splitToValueTypes(const DataLayout &DL, const ArgInfo &OrigArg, + unsigned OriginalIndex, SmallVectorImpl<ArgInfo> &SplitArgs, SmallVectorImpl<unsigned> &SplitArgsOrigIndices) const; }; diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp index eea28df7eda1..f50640521738 100644 --- a/lib/Target/Mips/MipsConstantIslandPass.cpp +++ b/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -222,12 +222,7 @@ namespace { BasicBlockInfo() = default; - // FIXME: ignore LogAlign for this patch - // - unsigned postOffset(unsigned LogAlign = 0) const { - unsigned PO = Offset + Size; - return PO; - } + unsigned postOffset() const { return Offset + Size; } }; std::vector<BasicBlockInfo> BBInfo; @@ -376,7 +371,7 @@ namespace { void doInitialPlacement(std::vector<MachineInstr*> &CPEMIs); CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI); - unsigned getCPELogAlign(const MachineInstr &CPEMI); + Align getCPEAlign(const MachineInstr &CPEMI); void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs); unsigned getOffsetOf(MachineInstr *MI) const; unsigned getUserOffset(CPUser&) const; @@ -534,11 +529,11 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) { MF->push_back(BB); // MachineConstantPool measures alignment in bytes. We measure in log2(bytes). - unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment()); + const Align MaxAlign(MCP->getConstantPoolAlignment()); // Mark the basic block as required by the const-pool. // If AlignConstantIslands isn't set, use 4-byte alignment for everything. - BB->setAlignment(AlignConstantIslands ? MaxAlign : 2); + BB->setAlignment(AlignConstantIslands ? MaxAlign : Align(4)); // The function needs to be as aligned as the basic blocks. The linker may // move functions around based on their alignment. @@ -548,7 +543,8 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) { // alignment of all entries as long as BB is sufficiently aligned. Keep // track of the insertion point for each alignment. We are going to bucket // sort the entries as they are created. - SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxAlign + 1, BB->end()); + SmallVector<MachineBasicBlock::iterator, 8> InsPoint(Log2(MaxAlign) + 1, + BB->end()); // Add all of the constants from the constant pool to the end block, use an // identity mapping of CPI's to CPE's. @@ -576,7 +572,7 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) { // Ensure that future entries with higher alignment get inserted before // CPEMI. This is bucket sort with iterators. - for (unsigned a = LogAlign + 1; a <= MaxAlign; ++a) + for (unsigned a = LogAlign + 1; a <= Log2(MaxAlign); ++a) if (InsPoint[a] == InsAt) InsPoint[a] = CPEMI; // Add a new CPEntry, but no corresponding CPUser yet. @@ -621,20 +617,18 @@ MipsConstantIslands::CPEntry return nullptr; } -/// getCPELogAlign - Returns the required alignment of the constant pool entry +/// getCPEAlign - Returns the required alignment of the constant pool entry /// represented by CPEMI. Alignment is measured in log2(bytes) units. -unsigned MipsConstantIslands::getCPELogAlign(const MachineInstr &CPEMI) { +Align MipsConstantIslands::getCPEAlign(const MachineInstr &CPEMI) { assert(CPEMI.getOpcode() == Mips::CONSTPOOL_ENTRY); // Everything is 4-byte aligned unless AlignConstantIslands is set. 
if (!AlignConstantIslands) - return 2; + return Align(4); unsigned CPI = CPEMI.getOperand(1).getIndex(); assert(CPI < MCP->getConstants().size() && "Invalid constant pool index."); - unsigned Align = MCP->getConstants()[CPI].getAlignment(); - assert(isPowerOf2_32(Align) && "Invalid CPE alignment"); - return Log2_32(Align); + return Align(MCP->getConstants()[CPI].getAlignment()); } /// initializeFunctionInfo - Do the initial scan of the function, building up @@ -940,13 +934,13 @@ bool MipsConstantIslands::isOffsetInRange(unsigned UserOffset, bool MipsConstantIslands::isWaterInRange(unsigned UserOffset, MachineBasicBlock* Water, CPUser &U, unsigned &Growth) { - unsigned CPELogAlign = getCPELogAlign(*U.CPEMI); - unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign); - unsigned NextBlockOffset, NextBlockAlignment; + unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(); + unsigned NextBlockOffset; + Align NextBlockAlignment; MachineFunction::const_iterator NextBlock = ++Water->getIterator(); if (NextBlock == MF->end()) { NextBlockOffset = BBInfo[Water->getNumber()].postOffset(); - NextBlockAlignment = 0; + NextBlockAlignment = Align::None(); } else { NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset; NextBlockAlignment = NextBlock->getAlignment(); @@ -961,7 +955,7 @@ bool MipsConstantIslands::isWaterInRange(unsigned UserOffset, Growth = CPEEnd - NextBlockOffset; // Compute the padding that would go at the end of the CPE to align the next // block. - Growth += OffsetToAlignment(CPEEnd, 1ULL << NextBlockAlignment); + Growth += offsetToAlignment(CPEEnd, NextBlockAlignment); // If the CPE is to be inserted before the instruction, that will raise // the offset of the instruction. Also account for unknown alignment padding @@ -1221,7 +1215,6 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex, CPUser &U = CPUsers[CPUserIndex]; MachineInstr *UserMI = U.MI; MachineInstr *CPEMI = U.CPEMI; - unsigned CPELogAlign = getCPELogAlign(*CPEMI); MachineBasicBlock *UserMBB = UserMI->getParent(); const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()]; @@ -1231,7 +1224,7 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex, // Size of branch to insert. unsigned Delta = 2; // Compute the offset where the CPE will begin. - unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta; + unsigned CPEOffset = UserBBI.postOffset() + Delta; if (isOffsetInRange(UserOffset, CPEOffset, U)) { LLVM_DEBUG(dbgs() << "Split at end of " << printMBBReference(*UserMBB) @@ -1257,9 +1250,8 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex, // Try to split the block so it's fully aligned. Compute the latest split // point where we can add a 4-byte branch instruction, and then align to - // LogAlign which is the largest possible alignment in the function. - unsigned LogAlign = MF->getAlignment(); - assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry"); + // Align which is the largest possible alignment in the function. 
+  const Align Align = MF->getAlignment();
   unsigned BaseInsertOffset = UserOffset + U.getMaxDisp();
   LLVM_DEBUG(dbgs() << format("Split in middle of big block before %#x",
                               BaseInsertOffset));
@@ -1270,7 +1262,7 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
   BaseInsertOffset -= 4;

   LLVM_DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
-                    << " la=" << LogAlign << '\n');
+                    << " la=" << Log2(Align) << '\n');

   // This could point off the end of the block if we've already got constant
   // pool entries following this block; only the last one is in the water list.
@@ -1295,8 +1287,8 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
     CPUser &U = CPUsers[CPUIndex];
     if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
       // Shift insertion point by one unit of alignment so it is within reach.
-      BaseInsertOffset -= 1u << LogAlign;
-      EndInsertOffset -= 1u << LogAlign;
+      BaseInsertOffset -= Align.value();
+      EndInsertOffset -= Align.value();
     }
     // This is overly conservative, as we don't account for CPEMIs being
     // reused within the block, but it doesn't matter much. Also assume CPEs
@@ -1399,7 +1391,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
   ++NumCPEs;

   // Mark the basic block as aligned as required by the const-pool entry.
-  NewIsland->setAlignment(getCPELogAlign(*U.CPEMI));
+  NewIsland->setAlignment(getCPEAlign(*U.CPEMI));

   // Increase the size of the island block to account for the new entry.
   BBInfo[NewIsland->getNumber()].Size += Size;
@@ -1431,10 +1423,11 @@ void MipsConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
     BBInfo[CPEBB->getNumber()].Size = 0;

     // This block no longer needs to be aligned.
-    CPEBB->setAlignment(0);
-  } else
+    CPEBB->setAlignment(Align(1));
+  } else {
     // Entries are sorted by descending alignment, so realign from the front.
-    CPEBB->setAlignment(getCPELogAlign(*CPEBB->begin()));
+    CPEBB->setAlignment(getCPEAlign(*CPEBB->begin()));
+  }
   adjustBBOffsetsAfter(CPEBB);

   // An island has only one predecessor BB and one successor BB. Check if
@@ -1529,7 +1522,7 @@ MipsConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
   // We should have a way to back out this alignment restriction if we "can" later.
   // but it is not harmful.
// - DestBB->setAlignment(2); + DestBB->setAlignment(Align(4)); Br.MaxDisp = ((1<<24)-1) * 2; MI->setDesc(TII->get(Mips::JalB16)); } diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td index daca8b907081..d3e68c014fb7 100644 --- a/lib/Target/Mips/MipsDSPInstrInfo.td +++ b/lib/Target/Mips/MipsDSPInstrInfo.td @@ -12,12 +12,19 @@ // ImmLeaf def immZExt1 : ImmLeaf<i32, [{return isUInt<1>(Imm);}]>; +def timmZExt1 : ImmLeaf<i32, [{return isUInt<1>(Imm);}], NOOP_SDNodeXForm, timm>; def immZExt2 : ImmLeaf<i32, [{return isUInt<2>(Imm);}]>; +def timmZExt2 : ImmLeaf<i32, [{return isUInt<2>(Imm);}], NOOP_SDNodeXForm, timm>; def immZExt3 : ImmLeaf<i32, [{return isUInt<3>(Imm);}]>; +def timmZExt3 : ImmLeaf<i32, [{return isUInt<3>(Imm);}], NOOP_SDNodeXForm, timm>; def immZExt4 : ImmLeaf<i32, [{return isUInt<4>(Imm);}]>; +def timmZExt4 : ImmLeaf<i32, [{return isUInt<4>(Imm);}], NOOP_SDNodeXForm, timm>; def immZExt8 : ImmLeaf<i32, [{return isUInt<8>(Imm);}]>; +def timmZExt8 : ImmLeaf<i32, [{return isUInt<8>(Imm);}], NOOP_SDNodeXForm, timm>; def immZExt10 : ImmLeaf<i32, [{return isUInt<10>(Imm);}]>; +def timmZExt10 : ImmLeaf<i32, [{return isUInt<10>(Imm);}], NOOP_SDNodeXForm, timm>; def immSExt6 : ImmLeaf<i32, [{return isInt<6>(Imm);}]>; +def timmSExt6 : ImmLeaf<i32, [{return isInt<6>(Imm);}], NOOP_SDNodeXForm, timm>; def immSExt10 : ImmLeaf<i32, [{return isInt<10>(Imm);}]>; // Mips-specific dsp nodes @@ -306,7 +313,7 @@ class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode, dag OutOperandList = (outs ROT:$rt); dag InOperandList = (ins ROS:$rs, uimm5:$sa, ROS:$src); string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa"); - list<dag> Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, immZExt5:$sa))]; + list<dag> Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, timmZExt5:$sa))]; InstrItinClass Itinerary = itin; string Constraints = "$src = $rt"; string BaseOpcode = instr_asm; @@ -443,7 +450,7 @@ class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode, dag OutOperandList = (outs GPR32Opnd:$rd); dag InOperandList = (ins uimm10:$mask); string AsmString = !strconcat(instr_asm, "\t$rd, $mask"); - list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode immZExt10:$mask))]; + list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode timmZExt10:$mask))]; InstrItinClass Itinerary = itin; string BaseOpcode = instr_asm; bit isMoveReg = 1; @@ -454,7 +461,7 @@ class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode, dag OutOperandList = (outs); dag InOperandList = (ins GPR32Opnd:$rs, uimm10:$mask); string AsmString = !strconcat(instr_asm, "\t$rs, $mask"); - list<dag> Pattern = [(OpNode GPR32Opnd:$rs, immZExt10:$mask)]; + list<dag> Pattern = [(OpNode GPR32Opnd:$rs, timmZExt10:$mask)]; InstrItinClass Itinerary = itin; string BaseOpcode = instr_asm; bit isMoveReg = 1; @@ -1096,14 +1103,14 @@ class SHRLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.ph", int_mips_shrl_ph, NoItinerary, DSPROpnd>; // Misc -class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, uimm5, immZExt5, +class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, uimm5, timmZExt5, NoItinerary>; -class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, uimm2, immZExt2, +class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, uimm2, timmZExt2, NoItinerary>; class PREPEND_DESC : APPEND_DESC_BASE<"prepend", int_mips_prepend, uimm5, - immZExt5, NoItinerary>; + timmZExt5, NoItinerary>; // Pseudos. 
def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASE<int_mips_bposge32, diff --git a/lib/Target/Mips/MipsExpandPseudo.cpp b/lib/Target/Mips/MipsExpandPseudo.cpp index 65d84a6c44a0..00cd284e7094 100644 --- a/lib/Target/Mips/MipsExpandPseudo.cpp +++ b/lib/Target/Mips/MipsExpandPseudo.cpp @@ -99,15 +99,15 @@ bool MipsExpandPseudo::expandAtomicCmpSwapSubword( : (ArePtrs64bit ? Mips::SC64 : Mips::SC); } - unsigned Dest = I->getOperand(0).getReg(); - unsigned Ptr = I->getOperand(1).getReg(); - unsigned Mask = I->getOperand(2).getReg(); - unsigned ShiftCmpVal = I->getOperand(3).getReg(); - unsigned Mask2 = I->getOperand(4).getReg(); - unsigned ShiftNewVal = I->getOperand(5).getReg(); - unsigned ShiftAmnt = I->getOperand(6).getReg(); - unsigned Scratch = I->getOperand(7).getReg(); - unsigned Scratch2 = I->getOperand(8).getReg(); + Register Dest = I->getOperand(0).getReg(); + Register Ptr = I->getOperand(1).getReg(); + Register Mask = I->getOperand(2).getReg(); + Register ShiftCmpVal = I->getOperand(3).getReg(); + Register Mask2 = I->getOperand(4).getReg(); + Register ShiftNewVal = I->getOperand(5).getReg(); + Register ShiftAmnt = I->getOperand(6).getReg(); + Register Scratch = I->getOperand(7).getReg(); + Register Scratch2 = I->getOperand(8).getReg(); // insert new blocks after the current block const BasicBlock *LLVM_BB = BB.getBasicBlock(); @@ -240,11 +240,11 @@ bool MipsExpandPseudo::expandAtomicCmpSwap(MachineBasicBlock &BB, MOVE = Mips::OR64; } - unsigned Dest = I->getOperand(0).getReg(); - unsigned Ptr = I->getOperand(1).getReg(); - unsigned OldVal = I->getOperand(2).getReg(); - unsigned NewVal = I->getOperand(3).getReg(); - unsigned Scratch = I->getOperand(4).getReg(); + Register Dest = I->getOperand(0).getReg(); + Register Ptr = I->getOperand(1).getReg(); + Register OldVal = I->getOperand(2).getReg(); + Register NewVal = I->getOperand(3).getReg(); + Register Scratch = I->getOperand(4).getReg(); // insert new blocks after the current block const BasicBlock *LLVM_BB = BB.getBasicBlock(); @@ -374,15 +374,15 @@ bool MipsExpandPseudo::expandAtomicBinOpSubword( llvm_unreachable("Unknown subword atomic pseudo for expansion!"); } - unsigned Dest = I->getOperand(0).getReg(); - unsigned Ptr = I->getOperand(1).getReg(); - unsigned Incr = I->getOperand(2).getReg(); - unsigned Mask = I->getOperand(3).getReg(); - unsigned Mask2 = I->getOperand(4).getReg(); - unsigned ShiftAmnt = I->getOperand(5).getReg(); - unsigned OldVal = I->getOperand(6).getReg(); - unsigned BinOpRes = I->getOperand(7).getReg(); - unsigned StoreVal = I->getOperand(8).getReg(); + Register Dest = I->getOperand(0).getReg(); + Register Ptr = I->getOperand(1).getReg(); + Register Incr = I->getOperand(2).getReg(); + Register Mask = I->getOperand(3).getReg(); + Register Mask2 = I->getOperand(4).getReg(); + Register ShiftAmnt = I->getOperand(5).getReg(); + Register OldVal = I->getOperand(6).getReg(); + Register BinOpRes = I->getOperand(7).getReg(); + Register StoreVal = I->getOperand(8).getReg(); const BasicBlock *LLVM_BB = BB.getBasicBlock(); MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); @@ -513,10 +513,10 @@ bool MipsExpandPseudo::expandAtomicBinOp(MachineBasicBlock &BB, BEQ = Mips::BEQ64; } - unsigned OldVal = I->getOperand(0).getReg(); - unsigned Ptr = I->getOperand(1).getReg(); - unsigned Incr = I->getOperand(2).getReg(); - unsigned Scratch = I->getOperand(3).getReg(); + Register OldVal = I->getOperand(0).getReg(); + Register Ptr = I->getOperand(1).getReg(); + Register Incr = I->getOperand(2).getReg(); + Register 
Scratch = I->getOperand(3).getReg(); unsigned Opcode = 0; unsigned OR = 0; diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp index 123d3cc242f0..80f288ac500c 100644 --- a/lib/Target/Mips/MipsFastISel.cpp +++ b/lib/Target/Mips/MipsFastISel.cpp @@ -1162,14 +1162,20 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI, if (ArgVT == MVT::f32) { VA.convertToReg(Mips::F12); } else if (ArgVT == MVT::f64) { - VA.convertToReg(Mips::D6); + if (Subtarget->isFP64bit()) + VA.convertToReg(Mips::D6_64); + else + VA.convertToReg(Mips::D6); } } else if (i == 1) { if ((firstMVT == MVT::f32) || (firstMVT == MVT::f64)) { if (ArgVT == MVT::f32) { VA.convertToReg(Mips::F14); } else if (ArgVT == MVT::f64) { - VA.convertToReg(Mips::D7); + if (Subtarget->isFP64bit()) + VA.convertToReg(Mips::D7_64); + else + VA.convertToReg(Mips::D7); } } } @@ -1722,7 +1728,7 @@ bool MipsFastISel::selectRet(const Instruction *I) { return false; unsigned SrcReg = Reg + VA.getValNo(); - unsigned DestReg = VA.getLocReg(); + Register DestReg = VA.getLocReg(); // Avoid a cross-class copy. This is very unlikely. if (!MRI.getRegClass(SrcReg)->contains(DestReg)) return false; diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h index 0537cfd1cb30..612b2b712fa8 100644 --- a/lib/Target/Mips/MipsFrameLowering.h +++ b/lib/Target/Mips/MipsFrameLowering.h @@ -24,8 +24,9 @@ protected: const MipsSubtarget &STI; public: - explicit MipsFrameLowering(const MipsSubtarget &sti, unsigned Alignment) - : TargetFrameLowering(StackGrowsDown, Alignment, 0, Alignment), STI(sti) {} + explicit MipsFrameLowering(const MipsSubtarget &sti, Align Alignment) + : TargetFrameLowering(StackGrowsDown, Alignment, 0, Alignment), STI(sti) { + } static const MipsFrameLowering *create(const MipsSubtarget &ST); diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index 9ba54d6bb73c..e5997af3bcc5 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -65,7 +65,7 @@ bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { /// getGlobalBaseReg - Output the instructions required to put the /// GOT address into a register. SDNode *MipsDAGToDAGISel::getGlobalBaseReg() { - unsigned GlobalBaseReg = MF->getInfo<MipsFunctionInfo>()->getGlobalBaseReg(); + Register GlobalBaseReg = MF->getInfo<MipsFunctionInfo>()->getGlobalBaseReg(); return CurDAG->getRegister(GlobalBaseReg, getTargetLowering()->getPointerTy( CurDAG->getDataLayout())) .getNode(); @@ -217,6 +217,51 @@ bool MipsDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const { return false; } +/// Convert vector addition with vector subtraction if that allows to encode +/// constant as an immediate and thus avoid extra 'ldi' instruction. 
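+/// The rewrite is only worthwhile when the original splat does not fit the +/// 5-bit immediate range checked by IsInlineConstant below but its negation +/// does; otherwise the constant must still be materialized first.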
+/// add X, <-1, -1...> --> sub X, <1, 1...> +bool MipsDAGToDAGISel::selectVecAddAsVecSubIfProfitable(SDNode *Node) { + assert(Node->getOpcode() == ISD::ADD && "Should only get 'add' here."); + + EVT VT = Node->getValueType(0); + assert(VT.isVector() && "Should only be called for vectors."); + + SDValue X = Node->getOperand(0); + SDValue C = Node->getOperand(1); + + auto *BVN = dyn_cast<BuildVectorSDNode>(C); + if (!BVN) + return false; + + APInt SplatValue, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + + if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs, + 8, !Subtarget->isLittle())) + return false; + + auto IsInlineConstant = [](const APInt &Imm) { return Imm.isIntN(5); }; + + if (IsInlineConstant(SplatValue)) + return false; // Can already be encoded as an immediate. + + APInt NegSplatValue = 0 - SplatValue; + if (!IsInlineConstant(NegSplatValue)) + return false; // Even if we negate it it won't help. + + SDLoc DL(Node); + + SDValue NegC = CurDAG->FoldConstantArithmetic( + ISD::SUB, DL, VT, CurDAG->getConstant(0, DL, VT).getNode(), C.getNode()); + assert(NegC && "Constant-folding failed!"); + SDValue NewNode = CurDAG->getNode(ISD::SUB, DL, VT, X, NegC); + + ReplaceNode(Node, NewNode.getNode()); + SelectCode(NewNode.getNode()); + return true; +} + /// Select instructions not customized! Used for /// expanded, promoted and normal instructions void MipsDAGToDAGISel::Select(SDNode *Node) { @@ -236,6 +281,12 @@ void MipsDAGToDAGISel::Select(SDNode *Node) { switch(Opcode) { default: break; + case ISD::ADD: + if (Node->getSimpleValueType(0).isVector() && + selectVecAddAsVecSubIfProfitable(Node)) + return; + break; + // Get target GOT address. case ISD::GLOBAL_OFFSET_TABLE: ReplaceNode(Node, getGlobalBaseReg()); diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h index bae3bbf71f3b..a768589b374b 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.h +++ b/lib/Target/Mips/MipsISelDAGToDAG.h @@ -125,6 +125,11 @@ private: /// starting at bit zero. virtual bool selectVSplatMaskR(SDValue N, SDValue &Imm) const; + /// Convert vector addition with vector subtraction if that allows to encode + /// constant as an immediate and thus avoid extra 'ldi' instruction. 
+ /// add X, <-1, -1...> --> sub X, <1, 1...> + bool selectVecAddAsVecSubIfProfitable(SDNode *Node); + void Select(SDNode *N) override; virtual bool trySelect(SDNode *Node) = 0; diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 0ff09007da4b..bf1b4756b24f 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -83,10 +83,6 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); static cl::opt<bool> -LargeGOT("mxgot", cl::Hidden, - cl::desc("MIPS: Enable GOT larger than 64k."), cl::init(false)); - -static cl::opt<bool> NoZeroDivCheck("mno-check-zero-division", cl::Hidden, cl::desc("MIPS: Don't trap on integer division by zero."), cl::init(false)); @@ -330,7 +326,7 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, } // Set LoadExtAction for f16 vectors to Expand - for (MVT VT : MVT::fp_vector_valuetypes()) { + for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) { MVT F16VT = MVT::getVectorVT(MVT::f16, VT.getVectorNumElements()); if (F16VT.isValid()) setLoadExtAction(ISD::EXTLOAD, VT, F16VT, Expand); @@ -518,11 +514,12 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setLibcallName(RTLIB::SRA_I128, nullptr); } - setMinFunctionAlignment(Subtarget.isGP64bit() ? 3 : 2); + setMinFunctionAlignment(Subtarget.isGP64bit() ? Align(8) : Align(4)); // The arguments on the stack are defined in terms of 4-byte slots on O32 // and 8-byte slots on N32/N64. - setMinStackArgumentAlignment((ABI.IsN32() || ABI.IsN64()) ? 8 : 4); + setMinStackArgumentAlignment((ABI.IsN32() || ABI.IsN64()) ? Align(8) + : Align(4)); setStackPointerRegisterToSaveRestore(ABI.IsN64() ? Mips::SP_64 : Mips::SP); @@ -552,8 +549,9 @@ MipsTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, !Subtarget.inMicroMipsMode(); // Disable if either of the following is true: - // We do not generate PIC, the ABI is not O32, LargeGOT is being used. - if (!TM.isPositionIndependent() || !TM.getABI().IsO32() || LargeGOT) + // We do not generate PIC, the ABI is not O32, XGOT is being used. + if (!TM.isPositionIndependent() || !TM.getABI().IsO32() || + Subtarget.useXGOT()) UseFastISel = false; return UseFastISel ? Mips::createFastISel(funcInfo, libInfo) : nullptr; @@ -1257,7 +1255,7 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const static unsigned addLiveIn(MachineFunction &MF, unsigned PReg, const TargetRegisterClass *RC) { - unsigned VReg = MF.getRegInfo().createVirtualRegister(RC); + Register VReg = MF.getRegInfo().createVirtualRegister(RC); MF.getRegInfo().addLiveIn(PReg, VReg); return VReg; } @@ -1477,10 +1475,10 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr &MI, llvm_unreachable("Unknown pseudo atomic for replacement!"); } - unsigned OldVal = MI.getOperand(0).getReg(); - unsigned Ptr = MI.getOperand(1).getReg(); - unsigned Incr = MI.getOperand(2).getReg(); - unsigned Scratch = RegInfo.createVirtualRegister(RegInfo.getRegClass(OldVal)); + Register OldVal = MI.getOperand(0).getReg(); + Register Ptr = MI.getOperand(1).getReg(); + Register Incr = MI.getOperand(2).getReg(); + Register Scratch = RegInfo.createVirtualRegister(RegInfo.getRegClass(OldVal)); MachineBasicBlock::iterator II(MI); @@ -1519,8 +1517,8 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr &MI, // containing the word. 
// - unsigned PtrCopy = RegInfo.createVirtualRegister(RegInfo.getRegClass(Ptr)); - unsigned IncrCopy = RegInfo.createVirtualRegister(RegInfo.getRegClass(Incr)); + Register PtrCopy = RegInfo.createVirtualRegister(RegInfo.getRegClass(Ptr)); + Register IncrCopy = RegInfo.createVirtualRegister(RegInfo.getRegClass(Incr)); BuildMI(*BB, II, DL, TII->get(Mips::COPY), IncrCopy).addReg(Incr); BuildMI(*BB, II, DL, TII->get(Mips::COPY), PtrCopy).addReg(Ptr); @@ -1556,7 +1554,7 @@ MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg( MachineFunction *MF = BB->getParent(); MachineRegisterInfo &RegInfo = MF->getRegInfo(); const TargetRegisterClass *RC = getRegClassFor(MVT::i32); - unsigned ScrReg = RegInfo.createVirtualRegister(RC); + Register ScrReg = RegInfo.createVirtualRegister(RC); assert(Size < 32); int64_t ShiftImm = 32 - (Size * 8); @@ -1581,21 +1579,21 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword( const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Dest = MI.getOperand(0).getReg(); - unsigned Ptr = MI.getOperand(1).getReg(); - unsigned Incr = MI.getOperand(2).getReg(); + Register Dest = MI.getOperand(0).getReg(); + Register Ptr = MI.getOperand(1).getReg(); + Register Incr = MI.getOperand(2).getReg(); - unsigned AlignedAddr = RegInfo.createVirtualRegister(RCp); - unsigned ShiftAmt = RegInfo.createVirtualRegister(RC); - unsigned Mask = RegInfo.createVirtualRegister(RC); - unsigned Mask2 = RegInfo.createVirtualRegister(RC); - unsigned Incr2 = RegInfo.createVirtualRegister(RC); - unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp); - unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC); - unsigned MaskUpper = RegInfo.createVirtualRegister(RC); - unsigned Scratch = RegInfo.createVirtualRegister(RC); - unsigned Scratch2 = RegInfo.createVirtualRegister(RC); - unsigned Scratch3 = RegInfo.createVirtualRegister(RC); + Register AlignedAddr = RegInfo.createVirtualRegister(RCp); + Register ShiftAmt = RegInfo.createVirtualRegister(RC); + Register Mask = RegInfo.createVirtualRegister(RC); + Register Mask2 = RegInfo.createVirtualRegister(RC); + Register Incr2 = RegInfo.createVirtualRegister(RC); + Register MaskLSB2 = RegInfo.createVirtualRegister(RCp); + Register PtrLSB2 = RegInfo.createVirtualRegister(RC); + Register MaskUpper = RegInfo.createVirtualRegister(RC); + Register Scratch = RegInfo.createVirtualRegister(RC); + Register Scratch2 = RegInfo.createVirtualRegister(RC); + Register Scratch3 = RegInfo.createVirtualRegister(RC); unsigned AtomicOp = 0; switch (MI.getOpcode()) { @@ -1678,7 +1676,7 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword( if (Subtarget.isLittle()) { BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3); } else { - unsigned Off = RegInfo.createVirtualRegister(RC); + Register Off = RegInfo.createVirtualRegister(RC); BuildMI(BB, DL, TII->get(Mips::XORi), Off) .addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2); BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(Off).addImm(3); @@ -1738,12 +1736,12 @@ MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI, unsigned AtomicOp = MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I32 ? 
Mips::ATOMIC_CMP_SWAP_I32_POSTRA : Mips::ATOMIC_CMP_SWAP_I64_POSTRA; - unsigned Dest = MI.getOperand(0).getReg(); - unsigned Ptr = MI.getOperand(1).getReg(); - unsigned OldVal = MI.getOperand(2).getReg(); - unsigned NewVal = MI.getOperand(3).getReg(); + Register Dest = MI.getOperand(0).getReg(); + Register Ptr = MI.getOperand(1).getReg(); + Register OldVal = MI.getOperand(2).getReg(); + Register NewVal = MI.getOperand(3).getReg(); - unsigned Scratch = MRI.createVirtualRegister(RC); + Register Scratch = MRI.createVirtualRegister(RC); MachineBasicBlock::iterator II(MI); // We need to create copies of the various registers and kill them at the @@ -1751,9 +1749,9 @@ MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI, // after fast register allocation, the spills will end up outside of the // blocks that their values are defined in, causing livein errors. - unsigned PtrCopy = MRI.createVirtualRegister(MRI.getRegClass(Ptr)); - unsigned OldValCopy = MRI.createVirtualRegister(MRI.getRegClass(OldVal)); - unsigned NewValCopy = MRI.createVirtualRegister(MRI.getRegClass(NewVal)); + Register PtrCopy = MRI.createVirtualRegister(MRI.getRegClass(Ptr)); + Register OldValCopy = MRI.createVirtualRegister(MRI.getRegClass(OldVal)); + Register NewValCopy = MRI.createVirtualRegister(MRI.getRegClass(NewVal)); BuildMI(*BB, II, DL, TII->get(Mips::COPY), PtrCopy).addReg(Ptr); BuildMI(*BB, II, DL, TII->get(Mips::COPY), OldValCopy).addReg(OldVal); @@ -1790,22 +1788,22 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword( const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Dest = MI.getOperand(0).getReg(); - unsigned Ptr = MI.getOperand(1).getReg(); - unsigned CmpVal = MI.getOperand(2).getReg(); - unsigned NewVal = MI.getOperand(3).getReg(); + Register Dest = MI.getOperand(0).getReg(); + Register Ptr = MI.getOperand(1).getReg(); + Register CmpVal = MI.getOperand(2).getReg(); + Register NewVal = MI.getOperand(3).getReg(); - unsigned AlignedAddr = RegInfo.createVirtualRegister(RCp); - unsigned ShiftAmt = RegInfo.createVirtualRegister(RC); - unsigned Mask = RegInfo.createVirtualRegister(RC); - unsigned Mask2 = RegInfo.createVirtualRegister(RC); - unsigned ShiftedCmpVal = RegInfo.createVirtualRegister(RC); - unsigned ShiftedNewVal = RegInfo.createVirtualRegister(RC); - unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp); - unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC); - unsigned MaskUpper = RegInfo.createVirtualRegister(RC); - unsigned MaskedCmpVal = RegInfo.createVirtualRegister(RC); - unsigned MaskedNewVal = RegInfo.createVirtualRegister(RC); + Register AlignedAddr = RegInfo.createVirtualRegister(RCp); + Register ShiftAmt = RegInfo.createVirtualRegister(RC); + Register Mask = RegInfo.createVirtualRegister(RC); + Register Mask2 = RegInfo.createVirtualRegister(RC); + Register ShiftedCmpVal = RegInfo.createVirtualRegister(RC); + Register ShiftedNewVal = RegInfo.createVirtualRegister(RC); + Register MaskLSB2 = RegInfo.createVirtualRegister(RCp); + Register PtrLSB2 = RegInfo.createVirtualRegister(RC); + Register MaskUpper = RegInfo.createVirtualRegister(RC); + Register MaskedCmpVal = RegInfo.createVirtualRegister(RC); + Register MaskedNewVal = RegInfo.createVirtualRegister(RC); unsigned AtomicOp = MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I8 ? Mips::ATOMIC_CMP_SWAP_I8_POSTRA : Mips::ATOMIC_CMP_SWAP_I16_POSTRA; @@ -1820,8 +1818,8 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword( // value isn't a problem. 
// The Dead flag is needed as the value in scratch isn't used by any other // instruction. Kill isn't used as Dead is more precise. - unsigned Scratch = RegInfo.createVirtualRegister(RC); - unsigned Scratch2 = RegInfo.createVirtualRegister(RC); + Register Scratch = RegInfo.createVirtualRegister(RC); + Register Scratch2 = RegInfo.createVirtualRegister(RC); // insert new blocks after the current block const BasicBlock *LLVM_BB = BB->getBasicBlock(); @@ -1859,7 +1857,7 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword( if (Subtarget.isLittle()) { BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3); } else { - unsigned Off = RegInfo.createVirtualRegister(RC); + Register Off = RegInfo.createVirtualRegister(RC); BuildMI(BB, DL, TII->get(Mips::XORi), Off) .addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2); BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(Off).addImm(3); @@ -1967,10 +1965,10 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op, // %gp_rel relocation return getAddrGPRel(N, SDLoc(N), Ty, DAG, ABI.IsN64()); - // %hi/%lo relocation + // %hi/%lo relocation return Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG) - // %highest/%higher/%hi/%lo relocation - : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG); + // %highest/%higher/%hi/%lo relocation + : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG); } // Every other architecture would use shouldAssumeDSOLocal in here, but @@ -1987,7 +1985,7 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op, if (GV->hasLocalLinkage()) return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64()); - if (LargeGOT) + if (Subtarget.useXGOT()) return getAddrGlobalLargeGOT( N, SDLoc(N), Ty, DAG, MipsII::MO_GOT_HI16, MipsII::MO_GOT_LO16, DAG.getEntryNode(), @@ -2149,7 +2147,8 @@ SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const { EVT VT = Node->getValueType(0); SDValue Chain = Node->getOperand(0); SDValue VAListPtr = Node->getOperand(1); - unsigned Align = Node->getConstantOperandVal(3); + const Align Align = + llvm::MaybeAlign(Node->getConstantOperandVal(3)).valueOrOne(); const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue(); SDLoc DL(Node); unsigned ArgSlotSizeInBytes = (ABI.IsN32() || ABI.IsN64()) ? 8 : 4; @@ -2166,14 +2165,13 @@ SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const { // when the pointer is still aligned from the last va_arg (or pair of // va_args for the i64 on O32 case). if (Align > getMinStackArgumentAlignment()) { - assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2"); + VAList = DAG.getNode( + ISD::ADD, DL, VAList.getValueType(), VAList, + DAG.getConstant(Align.value() - 1, DL, VAList.getValueType())); - VAList = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList, - DAG.getConstant(Align - 1, DL, VAList.getValueType())); - - VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList, - DAG.getConstant(-(int64_t)Align, DL, - VAList.getValueType())); + VAList = DAG.getNode( + ISD::AND, DL, VAList.getValueType(), VAList, + DAG.getConstant(-(int64_t)Align.value(), DL, VAList.getValueType())); } // Increment the pointer, VAList, to the next vaarg. 
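The va_arg hunk above rounds VAList up with an ADD and an AND node: the removed assert's power-of-two requirement is now guaranteed by the llvm::Align type itself, and the node pair computes (VAList + Align - 1) & -Align. As a minimal sketch, assuming a hypothetical standalone alignUp helper rather than anything in the patch, the same computation on plain integers is:

#include <cassert>
#include <cstdint>

// Round Ptr up to the next multiple of Alignment (a power of two); this is
// the scalar equivalent of the ADD/AND node pair built in lowerVAARG.
uint64_t alignUp(uint64_t Ptr, uint64_t Alignment) {
  assert(Alignment != 0 && (Alignment & (Alignment - 1)) == 0 &&
         "Alignment must be a power of two");
  Ptr += Alignment - 1;          // overshoot past the boundary (the ADD node)
  return Ptr & ~(Alignment - 1); // clear the low bits (the AND node)
}

The mask -(int64_t)Align.value() that the patch feeds to the AND node equals ~(Alignment - 1) in two's complement, so either spelling of the constant produces the same result.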
@@ -2870,7 +2868,7 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT, #include "MipsGenCallingConv.inc" CCAssignFn *MipsTargetLowering::CCAssignFnForCall() const{ - return CC_Mips; + return CC_Mips_FixedArg; } CCAssignFn *MipsTargetLowering::CCAssignFnForReturn() const{ @@ -3167,7 +3165,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Arg, DAG.getConstant(1, DL, MVT::i32)); if (!Subtarget.isLittle()) std::swap(Lo, Hi); - unsigned LocRegLo = VA.getLocReg(); + Register LocRegLo = VA.getLocReg(); unsigned LocRegHigh = getNextIntArgReg(LocRegLo); RegsToPass.push_back(std::make_pair(LocRegLo, Lo)); RegsToPass.push_back(std::make_pair(LocRegHigh, Hi)); @@ -3270,7 +3268,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (InternalLinkage) Callee = getAddrLocal(G, DL, Ty, DAG, ABI.IsN32() || ABI.IsN64()); - else if (LargeGOT) { + else if (Subtarget.useXGOT()) { Callee = getAddrGlobalLargeGOT(G, DL, Ty, DAG, MipsII::MO_CALL_HI16, MipsII::MO_CALL_LO16, Chain, FuncInfo->callPtrInfo(Val)); @@ -3292,7 +3290,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (!IsPIC) // static Callee = DAG.getTargetExternalSymbol( Sym, getPointerTy(DAG.getDataLayout()), MipsII::MO_NO_FLAG); - else if (LargeGOT) { + else if (Subtarget.useXGOT()) { Callee = getAddrGlobalLargeGOT(S, DL, Ty, DAG, MipsII::MO_CALL_HI16, MipsII::MO_CALL_LO16, Chain, FuncInfo->callPtrInfo(Sym)); @@ -3523,7 +3521,7 @@ SDValue MipsTargetLowering::LowerFormalArguments( // Arguments stored on registers if (IsRegLoc) { MVT RegVT = VA.getLocVT(); - unsigned ArgReg = VA.getLocReg(); + Register ArgReg = VA.getLocReg(); const TargetRegisterClass *RC = getRegClassFor(RegVT); // Transform the arguments stored on @@ -4568,20 +4566,20 @@ MachineBasicBlock *MipsTargetLowering::emitPseudoD_SELECT(MachineInstr &MI, // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned MipsTargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { +Register MipsTargetLowering::getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const { // Named registers is expected to be fairly rare. For now, just support $28 // since the linux kernel uses it. if (Subtarget.isGP64bit()) { - unsigned Reg = StringSwitch<unsigned>(RegName) + Register Reg = StringSwitch<Register>(RegName) .Case("$28", Mips::GP_64) - .Default(0); + .Default(Register()); if (Reg) return Reg; } else { - unsigned Reg = StringSwitch<unsigned>(RegName) + Register Reg = StringSwitch<Register>(RegName) .Case("$28", Mips::GP) - .Default(0); + .Default(Register()); if (Reg) return Reg; } diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index 2db60e9801f1..0a5cddd45afb 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -304,11 +304,12 @@ class TargetRegisterClass; unsigned &NumIntermediates, MVT &RegisterVT) const override; /// Return the correct alignment for the current calling convention. 
- unsigned getABIAlignmentForCallingConv(Type *ArgTy, - DataLayout DL) const override { + Align getABIAlignmentForCallingConv(Type *ArgTy, + DataLayout DL) const override { + const Align ABIAlign(DL.getABITypeAlignment(ArgTy)); if (ArgTy->isVectorTy()) - return std::min(DL.getABITypeAlignment(ArgTy), 8U); - return DL.getABITypeAlignment(ArgTy); + return std::min(ABIAlign, Align(8)); + return ABIAlign; } ISD::NodeType getExtendForAtomicOps() const override { @@ -347,8 +348,8 @@ class TargetRegisterClass; void HandleByVal(CCState *, unsigned &, unsigned) const override; - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; /// If a physical register, this returns the register that receives the /// exception address on entry to an EH pad. diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp index fbd56206b249..6bb25ee5754d 100644 --- a/lib/Target/Mips/MipsInstrInfo.cpp +++ b/lib/Target/Mips/MipsInstrInfo.cpp @@ -677,7 +677,8 @@ MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc, return MIB; } -bool MipsInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, +bool MipsInstrInfo::findCommutedOpIndices(const MachineInstr &MI, + unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { assert(!MI.isBundle() && "TargetInstrInfo::findCommutedOpIndices() can't handle bundles"); diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h index a626c0c3fdb8..092a960b4ba7 100644 --- a/lib/Target/Mips/MipsInstrInfo.h +++ b/lib/Target/Mips/MipsInstrInfo.h @@ -148,7 +148,7 @@ public: MachineInstrBuilder genInstrWithNewOpc(unsigned NewOpc, MachineBasicBlock::iterator I) const; - bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, + bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; /// Perform target specific instruction verification. diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index a4e85a38ab28..58167e0f344d 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -211,9 +211,9 @@ def HasCnMips : Predicate<"Subtarget->hasCnMips()">, AssemblerPredicate<"FeatureCnMips">; def NotCnMips : Predicate<"!Subtarget->hasCnMips()">, AssemblerPredicate<"!FeatureCnMips">; -def IsSym32 : Predicate<"Subtarget->HasSym32()">, +def IsSym32 : Predicate<"Subtarget->hasSym32()">, AssemblerPredicate<"FeatureSym32">; -def IsSym64 : Predicate<"!Subtarget->HasSym32()">, +def IsSym64 : Predicate<"!Subtarget->hasSym32()">, AssemblerPredicate<"!FeatureSym32">; def IsN64 : Predicate<"Subtarget->isABI_N64()">; def IsNotN64 : Predicate<"!Subtarget->isABI_N64()">; @@ -1263,6 +1263,7 @@ def immSExt16 : PatLeaf<(imm), [{ return isInt<16>(N->getSExtValue()); }]>; // Node immediate fits as 7-bit zero extended on target immediate. def immZExt7 : PatLeaf<(imm), [{ return isUInt<7>(N->getZExtValue()); }]>; +def timmZExt7 : PatLeaf<(timm), [{ return isUInt<7>(N->getZExtValue()); }]>; // Node immediate fits as 16-bit zero extended on target immediate. // The LO16 param means that only the lower 16 bits of the node @@ -1295,6 +1296,7 @@ def immZExt32 : PatLeaf<(imm), [{ return isUInt<32>(N->getZExtValue()); }]>; // shamt field must fit in 5 bits. 
def immZExt5 : ImmLeaf<i32, [{return Imm == (Imm & 0x1f);}]>; +def timmZExt5 : TImmLeaf<i32, [{return Imm == (Imm & 0x1f);}]>; def immZExt5Plus1 : PatLeaf<(imm), [{ return isUInt<5>(N->getZExtValue() - 1); @@ -3142,25 +3144,31 @@ multiclass MipsHiLoRelocs<Instruction Lui, Instruction Addiu, def : MipsPat<(MipsHi tconstpool:$in), (Lui tconstpool:$in)>; def : MipsPat<(MipsHi texternalsym:$in), (Lui texternalsym:$in)>; - def : MipsPat<(MipsLo tglobaladdr:$in), (Addiu ZeroReg, tglobaladdr:$in)>; + def : MipsPat<(MipsLo tglobaladdr:$in), + (Addiu ZeroReg, tglobaladdr:$in)>; def : MipsPat<(MipsLo tblockaddress:$in), (Addiu ZeroReg, tblockaddress:$in)>; - def : MipsPat<(MipsLo tjumptable:$in), (Addiu ZeroReg, tjumptable:$in)>; - def : MipsPat<(MipsLo tconstpool:$in), (Addiu ZeroReg, tconstpool:$in)>; + def : MipsPat<(MipsLo tjumptable:$in), + (Addiu ZeroReg, tjumptable:$in)>; + def : MipsPat<(MipsLo tconstpool:$in), + (Addiu ZeroReg, tconstpool:$in)>; def : MipsPat<(MipsLo tglobaltlsaddr:$in), (Addiu ZeroReg, tglobaltlsaddr:$in)>; - def : MipsPat<(MipsLo texternalsym:$in), (Addiu ZeroReg, texternalsym:$in)>; + def : MipsPat<(MipsLo texternalsym:$in), + (Addiu ZeroReg, texternalsym:$in)>; def : MipsPat<(add GPROpnd:$hi, (MipsLo tglobaladdr:$lo)), - (Addiu GPROpnd:$hi, tglobaladdr:$lo)>; + (Addiu GPROpnd:$hi, tglobaladdr:$lo)>; def : MipsPat<(add GPROpnd:$hi, (MipsLo tblockaddress:$lo)), - (Addiu GPROpnd:$hi, tblockaddress:$lo)>; + (Addiu GPROpnd:$hi, tblockaddress:$lo)>; def : MipsPat<(add GPROpnd:$hi, (MipsLo tjumptable:$lo)), - (Addiu GPROpnd:$hi, tjumptable:$lo)>; + (Addiu GPROpnd:$hi, tjumptable:$lo)>; def : MipsPat<(add GPROpnd:$hi, (MipsLo tconstpool:$lo)), - (Addiu GPROpnd:$hi, tconstpool:$lo)>; + (Addiu GPROpnd:$hi, tconstpool:$lo)>; def : MipsPat<(add GPROpnd:$hi, (MipsLo tglobaltlsaddr:$lo)), - (Addiu GPROpnd:$hi, tglobaltlsaddr:$lo)>; + (Addiu GPROpnd:$hi, tglobaltlsaddr:$lo)>; + def : MipsPat<(add GPROpnd:$hi, (MipsLo texternalsym:$lo)), + (Addiu GPROpnd:$hi, texternalsym:$lo)>; } // wrapper_pic diff --git a/lib/Target/Mips/MipsInstructionSelector.cpp b/lib/Target/Mips/MipsInstructionSelector.cpp index 45a47ad3c087..f8fc7cb0898b 100644 --- a/lib/Target/Mips/MipsInstructionSelector.cpp +++ b/lib/Target/Mips/MipsInstructionSelector.cpp @@ -17,6 +17,7 @@ #include "MipsTargetMachine.h" #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" #define DEBUG_TYPE "mips-isel" @@ -33,7 +34,7 @@ public: MipsInstructionSelector(const MipsTargetMachine &TM, const MipsSubtarget &STI, const MipsRegisterBankInfo &RBI); - bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override; + bool select(MachineInstr &I) override; static const char *getName() { return DEBUG_TYPE; } private: @@ -44,6 +45,8 @@ private: const TargetRegisterClass * getRegClassForTypeOnBank(unsigned OpSize, const RegisterBank &RB, const RegisterBankInfo &RBI) const; + unsigned selectLoadStoreOpCode(MachineInstr &I, + MachineRegisterInfo &MRI) const; const MipsTargetMachine &TM; const MipsSubtarget &STI; @@ -84,7 +87,7 @@ MipsInstructionSelector::MipsInstructionSelector( bool MipsInstructionSelector::selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const { Register DstReg = I.getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + if (Register::isPhysicalRegister(DstReg)) return true; const RegisterBank *RegBank = RBI.getRegBank(DstReg, MRI, TRI); @@ -158,9 +161,15 @@ bool 
MipsInstructionSelector::materialize32BitImm(Register DestReg, APInt Imm, } /// Returning Opc indicates that we failed to select MIPS instruction opcode. -static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned MemSizeInBytes, - unsigned RegBank, bool isFP64) { - bool isStore = Opc == TargetOpcode::G_STORE; +unsigned +MipsInstructionSelector::selectLoadStoreOpCode(MachineInstr &I, + MachineRegisterInfo &MRI) const { + STI.getRegisterInfo(); + const Register DestReg = I.getOperand(0).getReg(); + const unsigned RegBank = RBI.getRegBank(DestReg, MRI, TRI)->getID(); + const unsigned MemSizeInBytes = (*I.memoperands_begin())->getSize(); + unsigned Opc = I.getOpcode(); + const bool isStore = Opc == TargetOpcode::G_STORE; if (RegBank == Mips::GPRBRegBankID) { if (isStore) switch (MemSizeInBytes) { @@ -192,10 +201,24 @@ static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned MemSizeInBytes, case 4: return isStore ? Mips::SWC1 : Mips::LWC1; case 8: - if (isFP64) + if (STI.isFP64bit()) return isStore ? Mips::SDC164 : Mips::LDC164; else return isStore ? Mips::SDC1 : Mips::LDC1; + case 16: { + assert(STI.hasMSA() && "Vector instructions require target with MSA."); + const unsigned VectorElementSizeInBytes = + MRI.getType(DestReg).getElementType().getSizeInBytes(); + if (VectorElementSizeInBytes == 1) + return isStore ? Mips::ST_B : Mips::LD_B; + if (VectorElementSizeInBytes == 2) + return isStore ? Mips::ST_H : Mips::LD_H; + if (VectorElementSizeInBytes == 4) + return isStore ? Mips::ST_W : Mips::LD_W; + if (VectorElementSizeInBytes == 8) + return isStore ? Mips::ST_D : Mips::LD_D; + return Opc; + } default: return Opc; } @@ -203,8 +226,7 @@ static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned MemSizeInBytes, return Opc; } -bool MipsInstructionSelector::select(MachineInstr &I, - CodeGenCoverage &CoverageInfo) const { +bool MipsInstructionSelector::select(MachineInstr &I) { MachineBasicBlock &MBB = *I.getParent(); MachineFunction &MF = *MBB.getParent(); @@ -231,7 +253,7 @@ bool MipsInstructionSelector::select(MachineInstr &I, return true; } - if (selectImpl(I, CoverageInfo)) + if (selectImpl(I, *CoverageInfo)) return true; MachineInstr *MI = nullptr; @@ -265,6 +287,11 @@ bool MipsInstructionSelector::select(MachineInstr &I, .add(I.getOperand(2)); break; } + case G_INTTOPTR: + case G_PTRTOINT: { + I.setDesc(TII.get(COPY)); + return selectCopy(I, MRI); + } case G_FRAME_INDEX: { MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDiu)) .add(I.getOperand(0)) @@ -279,12 +306,71 @@ bool MipsInstructionSelector::select(MachineInstr &I, .add(I.getOperand(1)); break; } + case G_BRJT: { + unsigned EntrySize = + MF.getJumpTableInfo()->getEntrySize(MF.getDataLayout()); + assert(isPowerOf2_32(EntrySize) && + "Non-power-of-two jump-table entry size not supported."); + + Register JTIndex = MRI.createVirtualRegister(&Mips::GPR32RegClass); + MachineInstr *SLL = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::SLL)) + .addDef(JTIndex) + .addUse(I.getOperand(2).getReg()) + .addImm(Log2_32(EntrySize)); + if (!constrainSelectedInstRegOperands(*SLL, TII, TRI, RBI)) + return false; + + Register DestAddress = MRI.createVirtualRegister(&Mips::GPR32RegClass); + MachineInstr *ADDu = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDu)) + .addDef(DestAddress) + .addUse(I.getOperand(0).getReg()) + .addUse(JTIndex); + if (!constrainSelectedInstRegOperands(*ADDu, TII, TRI, RBI)) + return false; + + Register Dest = MRI.createVirtualRegister(&Mips::GPR32RegClass); + MachineInstr *LW = + BuildMI(MBB, I, 
I.getDebugLoc(), TII.get(Mips::LW)) + .addDef(Dest) + .addUse(DestAddress) + .addJumpTableIndex(I.getOperand(1).getIndex(), MipsII::MO_ABS_LO) + .addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo(), MachineMemOperand::MOLoad, 4, 4)); + if (!constrainSelectedInstRegOperands(*LW, TII, TRI, RBI)) + return false; + + if (MF.getTarget().isPositionIndependent()) { + Register DestTmp = MRI.createVirtualRegister(&Mips::GPR32RegClass); + LW->getOperand(0).setReg(DestTmp); + MachineInstr *ADDu = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDu)) + .addDef(Dest) + .addUse(DestTmp) + .addUse(MF.getInfo<MipsFunctionInfo>() + ->getGlobalBaseRegForGlobalISel()); + if (!constrainSelectedInstRegOperands(*ADDu, TII, TRI, RBI)) + return false; + } + + MachineInstr *Branch = + BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::PseudoIndirectBranch)) + .addUse(Dest); + if (!constrainSelectedInstRegOperands(*Branch, TII, TRI, RBI)) + return false; + + I.eraseFromParent(); + return true; + } + case G_BRINDIRECT: { + MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::PseudoIndirectBranch)) + .add(I.getOperand(0)); + break; + } case G_PHI: { const Register DestReg = I.getOperand(0).getReg(); const unsigned OpSize = MRI.getType(DestReg).getSizeInBits(); const TargetRegisterClass *DefRC = nullptr; - if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + if (Register::isPhysicalRegister(DestReg)) DefRC = TRI.getRegClass(DestReg); else DefRC = getRegClassForTypeOnBank(OpSize, @@ -297,26 +383,35 @@ bool MipsInstructionSelector::select(MachineInstr &I, case G_LOAD: case G_ZEXTLOAD: case G_SEXTLOAD: { - const Register DestReg = I.getOperand(0).getReg(); - const unsigned DestRegBank = RBI.getRegBank(DestReg, MRI, TRI)->getID(); - const unsigned OpSize = MRI.getType(DestReg).getSizeInBits(); - const unsigned OpMemSizeInBytes = (*I.memoperands_begin())->getSize(); - - if (DestRegBank == Mips::GPRBRegBankID && OpSize != 32) + const unsigned NewOpc = selectLoadStoreOpCode(I, MRI); + if (NewOpc == I.getOpcode()) return false; - if (DestRegBank == Mips::FPRBRegBankID && OpSize != 32 && OpSize != 64) - return false; + MachineOperand BaseAddr = I.getOperand(1); + int64_t SignedOffset = 0; + // Try to fold load/store + G_GEP + G_CONSTANT + // %SignedOffset:(s32) = G_CONSTANT i32 16_bit_signed_immediate + // %Addr:(p0) = G_GEP %BaseAddr, %SignedOffset + // %LoadResult/%StoreSrc = load/store %Addr(p0) + // into: + // %LoadResult/%StoreSrc = NewOpc %BaseAddr(p0), 16_bit_signed_immediate - const unsigned NewOpc = selectLoadStoreOpCode( - I.getOpcode(), OpMemSizeInBytes, DestRegBank, STI.isFP64bit()); - if (NewOpc == I.getOpcode()) - return false; + MachineInstr *Addr = MRI.getVRegDef(I.getOperand(1).getReg()); + if (Addr->getOpcode() == G_GEP) { + MachineInstr *Offset = MRI.getVRegDef(Addr->getOperand(2).getReg()); + if (Offset->getOpcode() == G_CONSTANT) { + APInt OffsetValue = Offset->getOperand(1).getCImm()->getValue(); + if (OffsetValue.isSignedIntN(16)) { + BaseAddr = Addr->getOperand(1); + SignedOffset = OffsetValue.getSExtValue(); + } + } + } MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc)) .add(I.getOperand(0)) - .add(I.getOperand(1)) - .addImm(0) + .add(BaseAddr) + .addImm(SignedOffset) .addMemOperand(*I.memoperands_begin()); break; } @@ -356,6 +451,18 @@ bool MipsInstructionSelector::select(MachineInstr &I, .add(I.getOperand(3)); break; } + case G_IMPLICIT_DEF: { + MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::IMPLICIT_DEF)) + .add(I.getOperand(0)); + + // Set class based on register bank, there can be 
fpr and gpr implicit def. + MRI.setRegClass(MI->getOperand(0).getReg(), + getRegClassForTypeOnBank( + MRI.getType(I.getOperand(0).getReg()).getSizeInBits(), + *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI), + RBI)); + break; + } case G_CONSTANT: { MachineIRBuilder B(I); if (!materialize32BitImm(I.getOperand(0).getReg(), @@ -423,7 +530,7 @@ bool MipsInstructionSelector::select(MachineInstr &I, Opcode = Mips::TRUNC_W_S; else Opcode = STI.isFP64bit() ? Mips::TRUNC_W_D64 : Mips::TRUNC_W_D32; - unsigned ResultInFPR = MRI.createVirtualRegister(&Mips::FGR32RegClass); + Register ResultInFPR = MRI.createVirtualRegister(&Mips::FGR32RegClass); MachineInstr *Trunc = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Opcode)) .addDef(ResultInFPR) .addUse(I.getOperand(1).getReg()); @@ -496,6 +603,24 @@ bool MipsInstructionSelector::select(MachineInstr &I, I.eraseFromParent(); return true; } + case G_JUMP_TABLE: { + if (MF.getTarget().isPositionIndependent()) { + MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LW)) + .addDef(I.getOperand(0).getReg()) + .addReg(MF.getInfo<MipsFunctionInfo>() + ->getGlobalBaseRegForGlobalISel()) + .addJumpTableIndex(I.getOperand(1).getIndex(), MipsII::MO_GOT) + .addMemOperand( + MF.getMachineMemOperand(MachinePointerInfo::getGOT(MF), + MachineMemOperand::MOLoad, 4, 4)); + } else { + MI = + BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LUi)) + .addDef(I.getOperand(0).getReg()) + .addJumpTableIndex(I.getOperand(1).getIndex(), MipsII::MO_ABS_HI); + } + break; + } case G_ICMP: { struct Instr { unsigned Opcode; @@ -626,7 +751,7 @@ bool MipsInstructionSelector::select(MachineInstr &I, // MipsFCMPCondCode, result is inverted i.e. MOVT_I is used. unsigned MoveOpcode = isLogicallyNegated ? Mips::MOVT_I : Mips::MOVF_I; - unsigned TrueInReg = MRI.createVirtualRegister(&Mips::GPR32RegClass); + Register TrueInReg = MRI.createVirtualRegister(&Mips::GPR32RegClass); BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDiu)) .addDef(TrueInReg) .addUse(Mips::ZERO) @@ -654,6 +779,33 @@ bool MipsInstructionSelector::select(MachineInstr &I, I.eraseFromParent(); return true; } + case G_FENCE: { + MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::SYNC)).addImm(0); + break; + } + case G_VASTART: { + MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>(); + int FI = FuncInfo->getVarArgsFrameIndex(); + + Register LeaReg = MRI.createVirtualRegister(&Mips::GPR32RegClass); + MachineInstr *LEA_ADDiu = + BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LEA_ADDiu)) + .addDef(LeaReg) + .addFrameIndex(FI) + .addImm(0); + if (!constrainSelectedInstRegOperands(*LEA_ADDiu, TII, TRI, RBI)) + return false; + + MachineInstr *Store = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::SW)) + .addUse(LeaReg) + .addUse(I.getOperand(0).getReg()) + .addImm(0); + if (!constrainSelectedInstRegOperands(*Store, TII, TRI, RBI)) + return false; + + I.eraseFromParent(); + return true; + } default: return false; } diff --git a/lib/Target/Mips/MipsLegalizerInfo.cpp b/lib/Target/Mips/MipsLegalizerInfo.cpp index e442a81837ed..bb4a1d902d75 100644 --- a/lib/Target/Mips/MipsLegalizerInfo.cpp +++ b/lib/Target/Mips/MipsLegalizerInfo.cpp @@ -16,18 +16,65 @@ using namespace llvm; +struct TypesAndMemOps { + LLT ValTy; + LLT PtrTy; + unsigned MemSize; + bool MustBeNaturallyAligned; +}; + +static bool +CheckTy0Ty1MemSizeAlign(const LegalityQuery &Query, + std::initializer_list<TypesAndMemOps> SupportedValues) { + for (auto &Val : SupportedValues) { + if (Val.ValTy != Query.Types[0]) + continue; + if (Val.PtrTy != Query.Types[1]) + continue; 
+ if (Val.MemSize != Query.MMODescrs[0].SizeInBits) + continue; + if (Val.MustBeNaturallyAligned && + Query.MMODescrs[0].SizeInBits % Query.MMODescrs[0].AlignInBits != 0) + continue; + return true; + } + return false; +} + +static bool CheckTyN(unsigned N, const LegalityQuery &Query, + std::initializer_list<LLT> SupportedValues) { + for (auto &Val : SupportedValues) + if (Val == Query.Types[N]) + return true; + return false; +} + MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { using namespace TargetOpcode; const LLT s1 = LLT::scalar(1); const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); + const LLT v16s8 = LLT::vector(16, 8); + const LLT v8s16 = LLT::vector(8, 16); + const LLT v4s32 = LLT::vector(4, 32); + const LLT v2s64 = LLT::vector(2, 64); const LLT p0 = LLT::pointer(0, 32); - getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) + getActionDefinitionsBuilder({G_SUB, G_MUL}) .legalFor({s32}) .clampScalar(0, s32, s32); + getActionDefinitionsBuilder(G_ADD) + .legalIf([=, &ST](const LegalityQuery &Query) { + if (CheckTyN(0, Query, {s32})) + return true; + if (ST.hasMSA() && CheckTyN(0, Query, {v16s8, v8s16, v4s32, v2s64})) + return true; + return false; + }) + .clampScalar(0, s32, s32); + getActionDefinitionsBuilder({G_UADDO, G_UADDE, G_USUBO, G_USUBE, G_UMULO}) .lowerFor({{s32, s1}}); @@ -36,13 +83,26 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { .maxScalar(0, s32); getActionDefinitionsBuilder({G_LOAD, G_STORE}) - .legalForTypesWithMemDesc({{s32, p0, 8, 8}, - {s32, p0, 16, 8}, - {s32, p0, 32, 8}, - {s64, p0, 64, 8}, - {p0, p0, 32, 8}}) + .legalIf([=, &ST](const LegalityQuery &Query) { + if (CheckTy0Ty1MemSizeAlign(Query, {{s32, p0, 8, ST.hasMips32r6()}, + {s32, p0, 16, ST.hasMips32r6()}, + {s32, p0, 32, ST.hasMips32r6()}, + {p0, p0, 32, ST.hasMips32r6()}, + {s64, p0, 64, ST.hasMips32r6()}})) + return true; + if (ST.hasMSA() && + CheckTy0Ty1MemSizeAlign(Query, {{v16s8, p0, 128, false}, + {v8s16, p0, 128, false}, + {v4s32, p0, 128, false}, + {v2s64, p0, 128, false}})) + return true; + return false; + }) .minScalar(0, s32); + getActionDefinitionsBuilder(G_IMPLICIT_DEF) + .legalFor({s32, s64}); + getActionDefinitionsBuilder(G_UNMERGE_VALUES) .legalFor({{s32, s64}}); @@ -50,9 +110,17 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { .legalFor({{s64, s32}}); getActionDefinitionsBuilder({G_ZEXTLOAD, G_SEXTLOAD}) - .legalForTypesWithMemDesc({{s32, p0, 8, 8}, - {s32, p0, 16, 8}}) - .minScalar(0, s32); + .legalForTypesWithMemDesc({{s32, p0, 8, 8}, + {s32, p0, 16, 8}}) + .clampScalar(0, s32, s32); + + getActionDefinitionsBuilder({G_ZEXT, G_SEXT}) + .legalIf([](const LegalityQuery &Query) { return false; }) + .maxScalar(0, s32); + + getActionDefinitionsBuilder(G_TRUNC) + .legalIf([](const LegalityQuery &Query) { return false; }) + .maxScalar(1, s32); getActionDefinitionsBuilder(G_SELECT) .legalForCartesianProduct({p0, s32, s64}, {s32}) @@ -63,6 +131,12 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { .legalFor({s32}) .minScalar(0, s32); + getActionDefinitionsBuilder(G_BRJT) + .legalFor({{p0, s32}}); + + getActionDefinitionsBuilder(G_BRINDIRECT) + .legalFor({p0}); + getActionDefinitionsBuilder(G_PHI) .legalFor({p0, s32, s64}) .minScalar(0, s32); @@ -77,8 +151,9 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { .libcallFor({s64}); getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR}) - .legalFor({s32, s32}) - .minScalar(1, s32); + .legalFor({{s32, s32}}) + .clampScalar(1, s32, s32) + .clampScalar(0, 
s32, s32); getActionDefinitionsBuilder(G_ICMP) .legalForCartesianProduct({s32}, {s32, p0}) @@ -89,15 +164,24 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { .legalFor({s32}) .clampScalar(0, s32, s32); - getActionDefinitionsBuilder(G_GEP) + getActionDefinitionsBuilder({G_GEP, G_INTTOPTR}) .legalFor({{p0, s32}}); + getActionDefinitionsBuilder(G_PTRTOINT) + .legalFor({{s32, p0}}); + getActionDefinitionsBuilder(G_FRAME_INDEX) .legalFor({p0}); - getActionDefinitionsBuilder(G_GLOBAL_VALUE) + getActionDefinitionsBuilder({G_GLOBAL_VALUE, G_JUMP_TABLE}) .legalFor({p0}); + getActionDefinitionsBuilder(G_DYN_STACKALLOC) + .lowerFor({{p0, s32}}); + + getActionDefinitionsBuilder(G_VASTART) + .legalFor({p0}); + // FP instructions getActionDefinitionsBuilder(G_FCONSTANT) .legalFor({s32, s64}); @@ -126,6 +210,7 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { getActionDefinitionsBuilder(G_FPTOUI) .libcallForCartesianProduct({s64}, {s64, s32}) + .lowerForCartesianProduct({s32}, {s64, s32}) .minScalar(0, s32); // Int to FP conversion instructions @@ -136,8 +221,11 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { getActionDefinitionsBuilder(G_UITOFP) .libcallForCartesianProduct({s64, s32}, {s64}) + .customForCartesianProduct({s64, s32}, {s32}) .minScalar(1, s32); + getActionDefinitionsBuilder(G_SEXT_INREG).lower(); + computeTables(); verify(*ST.getInstrInfo()); } @@ -150,6 +238,134 @@ bool MipsLegalizerInfo::legalizeCustom(MachineInstr &MI, using namespace TargetOpcode; MIRBuilder.setInstr(MI); + const MipsSubtarget &STI = + static_cast<const MipsSubtarget &>(MIRBuilder.getMF().getSubtarget()); + const LLT s32 = LLT::scalar(32); + const LLT s64 = LLT::scalar(64); - return false; + switch (MI.getOpcode()) { + case G_UITOFP: { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(Dst); + LLT SrcTy = MRI.getType(Src); + + if (SrcTy != s32) + return false; + if (DstTy != s32 && DstTy != s64) + return false; + + // Let 0xABCDEFGH be given unsigned in MI.getOperand(1). First let's convert + // unsigned to double. Mantissa has 52 bits so we use following trick: + // First make floating point bit mask 0x43300000ABCDEFGH. + // Mask represents 2^52 * 0x1.00000ABCDEFGH i.e. 0x100000ABCDEFGH.0 . + // Next, subtract 2^52 * 0x1.0000000000000 i.e. 0x10000000000000.0 from it. + // Done. Trunc double to float if needed. + + MachineInstrBuilder Bitcast = MIRBuilder.buildInstr( + STI.isFP64bit() ? 
Mips::BuildPairF64_64 : Mips::BuildPairF64, {s64}, + {Src, MIRBuilder.buildConstant(s32, UINT32_C(0x43300000))}); + Bitcast.constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(), + *STI.getRegBankInfo()); + + MachineInstrBuilder TwoP52FP = MIRBuilder.buildFConstant( + s64, BitsToDouble(UINT64_C(0x4330000000000000))); + + if (DstTy == s64) + MIRBuilder.buildFSub(Dst, Bitcast, TwoP52FP); + else { + MachineInstrBuilder ResF64 = MIRBuilder.buildFSub(s64, Bitcast, TwoP52FP); + MIRBuilder.buildFPTrunc(Dst, ResF64); + } + + MI.eraseFromParent(); + break; + } + default: + return false; + } + + return true; +} + +static bool SelectMSA3OpIntrinsic(MachineInstr &MI, unsigned Opcode, + MachineIRBuilder &MIRBuilder, + const MipsSubtarget &ST) { + assert(ST.hasMSA() && "MSA intrinsic not supported on target without MSA."); + if (!MIRBuilder.buildInstr(Opcode) + .add(MI.getOperand(0)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .constrainAllUses(MIRBuilder.getTII(), *ST.getRegisterInfo(), + *ST.getRegBankInfo())) + return false; + MI.eraseFromParent(); + return true; +} + +static bool MSA3OpIntrinsicToGeneric(MachineInstr &MI, unsigned Opcode, + MachineIRBuilder &MIRBuilder, + const MipsSubtarget &ST) { + assert(ST.hasMSA() && "MSA intrinsic not supported on target without MSA."); + MIRBuilder.buildInstr(Opcode) + .add(MI.getOperand(0)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)); + MI.eraseFromParent(); + return true; +} + +bool MipsLegalizerInfo::legalizeIntrinsic(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + const MipsSubtarget &ST = + static_cast<const MipsSubtarget &>(MI.getMF()->getSubtarget()); + const MipsInstrInfo &TII = *ST.getInstrInfo(); + const MipsRegisterInfo &TRI = *ST.getRegisterInfo(); + const RegisterBankInfo &RBI = *ST.getRegBankInfo(); + MIRBuilder.setInstr(MI); + + switch (MI.getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memset: + case Intrinsic::memmove: + if (createMemLibcall(MIRBuilder, MRI, MI) == + LegalizerHelper::UnableToLegalize) + return false; + MI.eraseFromParent(); + return true; + case Intrinsic::trap: { + MachineInstr *Trap = MIRBuilder.buildInstr(Mips::TRAP); + MI.eraseFromParent(); + return constrainSelectedInstRegOperands(*Trap, TII, TRI, RBI); + } + case Intrinsic::vacopy: { + Register Tmp = MRI.createGenericVirtualRegister(LLT::pointer(0, 32)); + MachinePointerInfo MPO; + MIRBuilder.buildLoad(Tmp, MI.getOperand(2), + *MI.getMF()->getMachineMemOperand( + MPO, MachineMemOperand::MOLoad, 4, 4)); + MIRBuilder.buildStore(Tmp, MI.getOperand(1), + *MI.getMF()->getMachineMemOperand( + MPO, MachineMemOperand::MOStore, 4, 4)); + MI.eraseFromParent(); + return true; + } + case Intrinsic::mips_addv_b: + case Intrinsic::mips_addv_h: + case Intrinsic::mips_addv_w: + case Intrinsic::mips_addv_d: + return MSA3OpIntrinsicToGeneric(MI, TargetOpcode::G_ADD, MIRBuilder, ST); + case Intrinsic::mips_addvi_b: + return SelectMSA3OpIntrinsic(MI, Mips::ADDVI_B, MIRBuilder, ST); + case Intrinsic::mips_addvi_h: + return SelectMSA3OpIntrinsic(MI, Mips::ADDVI_H, MIRBuilder, ST); + case Intrinsic::mips_addvi_w: + return SelectMSA3OpIntrinsic(MI, Mips::ADDVI_W, MIRBuilder, ST); + case Intrinsic::mips_addvi_d: + return SelectMSA3OpIntrinsic(MI, Mips::ADDVI_D, MIRBuilder, ST); + default: + break; + } + return true; } diff --git a/lib/Target/Mips/MipsLegalizerInfo.h b/lib/Target/Mips/MipsLegalizerInfo.h index e5021e081890..9696c262b2db 100644 --- a/lib/Target/Mips/MipsLegalizerInfo.h +++ 
b/lib/Target/Mips/MipsLegalizerInfo.h @@ -28,6 +28,9 @@ public: bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, GISelChangeObserver &Observer) const override; + + bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const override; }; } // end namespace llvm #endif diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td index 907ed9ef746f..f585d9c1a148 100644 --- a/lib/Target/Mips/MipsMSAInstrInfo.td +++ b/lib/Target/Mips/MipsMSAInstrInfo.td @@ -60,6 +60,11 @@ def immZExt2Ptr : ImmLeaf<iPTR, [{return isUInt<2>(Imm);}]>; def immZExt3Ptr : ImmLeaf<iPTR, [{return isUInt<3>(Imm);}]>; def immZExt4Ptr : ImmLeaf<iPTR, [{return isUInt<4>(Imm);}]>; +def timmZExt1Ptr : TImmLeaf<iPTR, [{return isUInt<1>(Imm);}]>; +def timmZExt2Ptr : TImmLeaf<iPTR, [{return isUInt<2>(Imm);}]>; +def timmZExt3Ptr : TImmLeaf<iPTR, [{return isUInt<3>(Imm);}]>; +def timmZExt4Ptr : TImmLeaf<iPTR, [{return isUInt<4>(Imm);}]>; + // Operands def immZExt2Lsa : ImmLeaf<i32, [{return isUInt<2>(Imm - 1);}]>; @@ -1270,7 +1275,7 @@ class MSA_I8_SHF_DESC_BASE<string instr_asm, RegisterOperand ROWD, dag OutOperandList = (outs ROWD:$wd); dag InOperandList = (ins ROWS:$ws, uimm8:$u8); string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $u8"); - list<dag> Pattern = [(set ROWD:$wd, (MipsSHF immZExt8:$u8, ROWS:$ws))]; + list<dag> Pattern = [(set ROWD:$wd, (MipsSHF timmZExt8:$u8, ROWS:$ws))]; InstrItinClass Itinerary = itin; } @@ -2299,13 +2304,13 @@ class INSERT_FW_VIDX64_PSEUDO_DESC : class INSERT_FD_VIDX64_PSEUDO_DESC : MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2f64, MSA128DOpnd, FGR64Opnd, GPR64Opnd>; -class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, uimm4, immZExt4, +class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, uimm4, timmZExt4, MSA128BOpnd>; -class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, uimm3, immZExt3, +class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, uimm3, timmZExt3, MSA128HOpnd>; -class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32, uimm2, immZExt2, +class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32, uimm2, timmZExt2, MSA128WOpnd>; -class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, uimm1, immZExt1, +class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, uimm1, timmZExt1, MSA128DOpnd>; class LD_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -2518,22 +2523,22 @@ class PCNT_W_DESC : MSA_2R_DESC_BASE<"pcnt.w", ctpop, MSA128WOpnd>; class PCNT_D_DESC : MSA_2R_DESC_BASE<"pcnt.d", ctpop, MSA128DOpnd>; class SAT_S_B_DESC : MSA_BIT_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b, uimm3, - immZExt3, MSA128BOpnd>; + timmZExt3, MSA128BOpnd>; class SAT_S_H_DESC : MSA_BIT_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h, uimm4, - immZExt4, MSA128HOpnd>; + timmZExt4, MSA128HOpnd>; class SAT_S_W_DESC : MSA_BIT_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w, uimm5, - immZExt5, MSA128WOpnd>; + timmZExt5, MSA128WOpnd>; class SAT_S_D_DESC : MSA_BIT_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d, uimm6, - immZExt6, MSA128DOpnd>; + timmZExt6, MSA128DOpnd>; class SAT_U_B_DESC : MSA_BIT_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b, uimm3, - immZExt3, MSA128BOpnd>; + timmZExt3, MSA128BOpnd>; class SAT_U_H_DESC : MSA_BIT_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h, uimm4, - immZExt4, MSA128HOpnd>; + timmZExt4, MSA128HOpnd>; class SAT_U_W_DESC : MSA_BIT_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w, uimm5, - immZExt5, MSA128WOpnd>; + timmZExt5, 
MSA128WOpnd>; class SAT_U_D_DESC : MSA_BIT_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d, uimm6, - immZExt6, MSA128DOpnd>; + timmZExt6, MSA128DOpnd>; class SHF_B_DESC : MSA_I8_SHF_DESC_BASE<"shf.b", MSA128BOpnd>; class SHF_H_DESC : MSA_I8_SHF_DESC_BASE<"shf.h", MSA128HOpnd>; @@ -2546,16 +2551,16 @@ class SLD_D_DESC : MSA_3R_SLD_DESC_BASE<"sld.d", int_mips_sld_d, MSA128DOpnd>; class SLDI_B_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.b", int_mips_sldi_b, MSA128BOpnd, MSA128BOpnd, uimm4, - immZExt4>; + timmZExt4>; class SLDI_H_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.h", int_mips_sldi_h, MSA128HOpnd, MSA128HOpnd, uimm3, - immZExt3>; + timmZExt3>; class SLDI_W_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.w", int_mips_sldi_w, MSA128WOpnd, MSA128WOpnd, uimm2, - immZExt2>; + timmZExt2>; class SLDI_D_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.d", int_mips_sldi_d, MSA128DOpnd, MSA128DOpnd, uimm1, - immZExt1>; + timmZExt1>; class SLL_B_DESC : MSA_3R_DESC_BASE<"sll.b", shl, MSA128BOpnd>; class SLL_H_DESC : MSA_3R_DESC_BASE<"sll.h", shl, MSA128HOpnd>; @@ -2609,13 +2614,13 @@ class SRAR_W_DESC : MSA_3R_DESC_BASE<"srar.w", int_mips_srar_w, MSA128WOpnd>; class SRAR_D_DESC : MSA_3R_DESC_BASE<"srar.d", int_mips_srar_d, MSA128DOpnd>; class SRARI_B_DESC : MSA_BIT_X_DESC_BASE<"srari.b", int_mips_srari_b, uimm3, - immZExt3, MSA128BOpnd>; + timmZExt3, MSA128BOpnd>; class SRARI_H_DESC : MSA_BIT_X_DESC_BASE<"srari.h", int_mips_srari_h, uimm4, - immZExt4, MSA128HOpnd>; + timmZExt4, MSA128HOpnd>; class SRARI_W_DESC : MSA_BIT_X_DESC_BASE<"srari.w", int_mips_srari_w, uimm5, - immZExt5, MSA128WOpnd>; + timmZExt5, MSA128WOpnd>; class SRARI_D_DESC : MSA_BIT_X_DESC_BASE<"srari.d", int_mips_srari_d, uimm6, - immZExt6, MSA128DOpnd>; + timmZExt6, MSA128DOpnd>; class SRL_B_DESC : MSA_3R_DESC_BASE<"srl.b", srl, MSA128BOpnd>; class SRL_H_DESC : MSA_3R_DESC_BASE<"srl.h", srl, MSA128HOpnd>; @@ -2637,13 +2642,13 @@ class SRLR_W_DESC : MSA_3R_DESC_BASE<"srlr.w", int_mips_srlr_w, MSA128WOpnd>; class SRLR_D_DESC : MSA_3R_DESC_BASE<"srlr.d", int_mips_srlr_d, MSA128DOpnd>; class SRLRI_B_DESC : MSA_BIT_X_DESC_BASE<"srlri.b", int_mips_srlri_b, uimm3, - immZExt3, MSA128BOpnd>; + timmZExt3, MSA128BOpnd>; class SRLRI_H_DESC : MSA_BIT_X_DESC_BASE<"srlri.h", int_mips_srlri_h, uimm4, - immZExt4, MSA128HOpnd>; + timmZExt4, MSA128HOpnd>; class SRLRI_W_DESC : MSA_BIT_X_DESC_BASE<"srlri.w", int_mips_srlri_w, uimm5, - immZExt5, MSA128WOpnd>; + timmZExt5, MSA128WOpnd>; class SRLRI_D_DESC : MSA_BIT_X_DESC_BASE<"srlri.d", int_mips_srlri_d, uimm6, - immZExt6, MSA128DOpnd>; + timmZExt6, MSA128DOpnd>; class ST_DESC_BASE<string instr_asm, SDPatternOperator OpNode, ValueType TyNode, RegisterOperand ROWD, diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp index 5ef07a2d283e..8bd64ff6cb27 100644 --- a/lib/Target/Mips/MipsOptimizePICCall.cpp +++ b/lib/Target/Mips/MipsOptimizePICCall.cpp @@ -127,8 +127,7 @@ static MachineOperand *getCallTargetRegOpnd(MachineInstr &MI) { MachineOperand &MO = MI.getOperand(0); - if (!MO.isReg() || !MO.isUse() || - !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (!MO.isReg() || !MO.isUse() || !Register::isVirtualRegister(MO.getReg())) return nullptr; return &MO; @@ -152,7 +151,7 @@ static void setCallTargetReg(MachineBasicBlock *MBB, MachineBasicBlock::iterator I) { MachineFunction &MF = *MBB->getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - unsigned SrcReg = I->getOperand(0).getReg(); + Register SrcReg = I->getOperand(0).getReg(); unsigned DstReg = getRegTy(SrcReg, MF) == MVT::i32 ? 
Mips::T9 : Mips::T9_64; BuildMI(*MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), DstReg) .addReg(SrcReg); diff --git a/lib/Target/Mips/MipsPfmCounters.td b/lib/Target/Mips/MipsPfmCounters.td new file mode 100644 index 000000000000..c7779b474b91 --- /dev/null +++ b/lib/Target/Mips/MipsPfmCounters.td @@ -0,0 +1,18 @@ +//===-- MipsPfmCounters.td - Mips Hardware Counters --------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This describes the available hardware counters for Mips. +// +//===----------------------------------------------------------------------===// + +def CpuCyclesPfmCounter : PfmCounter<"CYCLES">; + +def DefaultPfmCounters : ProcPfmCounters { + let CycleCounter = CpuCyclesPfmCounter; +} +def : PfmCountersDefaultBinding<DefaultPfmCounters>; diff --git a/lib/Target/Mips/MipsPreLegalizerCombiner.cpp b/lib/Target/Mips/MipsPreLegalizerCombiner.cpp index 85076590d407..ace0735652bd 100644 --- a/lib/Target/Mips/MipsPreLegalizerCombiner.cpp +++ b/lib/Target/Mips/MipsPreLegalizerCombiner.cpp @@ -27,7 +27,8 @@ class MipsPreLegalizerCombinerInfo : public CombinerInfo { public: MipsPreLegalizerCombinerInfo() : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, - /*LegalizerInfo*/ nullptr) {} + /*LegalizerInfo*/ nullptr, /*EnableOpt*/ false, + /*EnableOptSize*/ false, /*EnableMinSize*/ false) {} virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const override; }; diff --git a/lib/Target/Mips/MipsRegisterBankInfo.cpp b/lib/Target/Mips/MipsRegisterBankInfo.cpp index d8bcf16afd50..d334366e727c 100644 --- a/lib/Target/Mips/MipsRegisterBankInfo.cpp +++ b/lib/Target/Mips/MipsRegisterBankInfo.cpp @@ -12,6 +12,7 @@ #include "MipsRegisterBankInfo.h" #include "MipsInstrInfo.h" +#include "MipsTargetMachine.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" @@ -27,20 +28,23 @@ enum PartialMappingIdx { PMI_GPR, PMI_SPR, PMI_DPR, + PMI_MSA, PMI_Min = PMI_GPR, }; RegisterBankInfo::PartialMapping PartMappings[]{ {0, 32, GPRBRegBank}, {0, 32, FPRBRegBank}, - {0, 64, FPRBRegBank} + {0, 64, FPRBRegBank}, + {0, 128, FPRBRegBank} }; enum ValueMappingIdx { InvalidIdx = 0, GPRIdx = 1, SPRIdx = 4, - DPRIdx = 7 + DPRIdx = 7, + MSAIdx = 10 }; RegisterBankInfo::ValueMapping ValueMappings[] = { @@ -50,14 +54,18 @@ RegisterBankInfo::ValueMapping ValueMappings[] = { {&PartMappings[PMI_GPR - PMI_Min], 1}, {&PartMappings[PMI_GPR - PMI_Min], 1}, {&PartMappings[PMI_GPR - PMI_Min], 1}, - // up to 3 ops operands FPRs - single precission + // up to 3 operands in FPRs - single precission {&PartMappings[PMI_SPR - PMI_Min], 1}, {&PartMappings[PMI_SPR - PMI_Min], 1}, {&PartMappings[PMI_SPR - PMI_Min], 1}, - // up to 3 ops operands FPRs - double precission + // up to 3 operands in FPRs - double precission {&PartMappings[PMI_DPR - PMI_Min], 1}, {&PartMappings[PMI_DPR - PMI_Min], 1}, - {&PartMappings[PMI_DPR - PMI_Min], 1} + {&PartMappings[PMI_DPR - PMI_Min], 1}, + // up to 3 operands in FPRs - MSA + {&PartMappings[PMI_MSA - PMI_Min], 1}, + {&PartMappings[PMI_MSA - PMI_Min], 1}, + {&PartMappings[PMI_MSA - PMI_Min], 1} }; } // end namespace Mips @@ -86,6 +94,10 @@ const 
RegisterBank &MipsRegisterBankInfo::getRegBankFromRegClass( case Mips::FGR32RegClassID: case Mips::FGR64RegClassID: case Mips::AFGR64RegClassID: + case Mips::MSA128BRegClassID: + case Mips::MSA128HRegClassID: + case Mips::MSA128WRegClassID: + case Mips::MSA128DRegClassID: return getRegBank(Mips::FPRBRegBankID); default: llvm_unreachable("Register class not supported"); @@ -149,6 +161,7 @@ static bool isAmbiguous(unsigned Opc) { case TargetOpcode::G_STORE: case TargetOpcode::G_PHI: case TargetOpcode::G_SELECT: + case TargetOpcode::G_IMPLICIT_DEF: return true; default: return false; @@ -163,8 +176,7 @@ void MipsRegisterBankInfo::AmbiguousRegDefUseContainer::addDefUses( MachineInstr *NonCopyInstr = skipCopiesOutgoing(&UseMI); // Copy with many uses. if (NonCopyInstr->getOpcode() == TargetOpcode::COPY && - !TargetRegisterInfo::isPhysicalRegister( - NonCopyInstr->getOperand(0).getReg())) + !Register::isPhysicalRegister(NonCopyInstr->getOperand(0).getReg())) addDefUses(NonCopyInstr->getOperand(0).getReg(), MRI); else DefUses.push_back(skipCopiesOutgoing(&UseMI)); @@ -186,7 +198,7 @@ MipsRegisterBankInfo::AmbiguousRegDefUseContainer::skipCopiesOutgoing( const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineInstr *Ret = MI; while (Ret->getOpcode() == TargetOpcode::COPY && - !TargetRegisterInfo::isPhysicalRegister(Ret->getOperand(0).getReg()) && + !Register::isPhysicalRegister(Ret->getOperand(0).getReg()) && MRI.hasOneUse(Ret->getOperand(0).getReg())) { Ret = &(*MRI.use_instr_begin(Ret->getOperand(0).getReg())); } @@ -200,7 +212,7 @@ MipsRegisterBankInfo::AmbiguousRegDefUseContainer::skipCopiesIncoming( const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineInstr *Ret = MI; while (Ret->getOpcode() == TargetOpcode::COPY && - !TargetRegisterInfo::isPhysicalRegister(Ret->getOperand(1).getReg())) + !Register::isPhysicalRegister(Ret->getOperand(1).getReg())) Ret = MRI.getVRegDef(Ret->getOperand(1).getReg()); return Ret; } @@ -231,6 +243,9 @@ MipsRegisterBankInfo::AmbiguousRegDefUseContainer::AmbiguousRegDefUseContainer( addUseDef(MI->getOperand(2).getReg(), MRI); addUseDef(MI->getOperand(3).getReg(), MRI); } + + if (MI->getOpcode() == TargetOpcode::G_IMPLICIT_DEF) + addDefUses(MI->getOperand(0).getReg(), MRI); } bool MipsRegisterBankInfo::TypeInfoForMF::visit( @@ -318,8 +333,7 @@ void MipsRegisterBankInfo::TypeInfoForMF::setTypes(const MachineInstr *MI, void MipsRegisterBankInfo::TypeInfoForMF::setTypesAccordingToPhysicalRegister( const MachineInstr *MI, const MachineInstr *CopyInst, unsigned Op) { - assert((TargetRegisterInfo::isPhysicalRegister( - CopyInst->getOperand(Op).getReg())) && + assert((Register::isPhysicalRegister(CopyInst->getOperand(Op).getReg())) && "Copies of non physical registers should not be considered here.\n"); const MachineFunction &MF = *CopyInst->getMF(); @@ -353,6 +367,31 @@ void MipsRegisterBankInfo::TypeInfoForMF::cleanupIfNewFunction( } } +static const MipsRegisterBankInfo::ValueMapping * +getMSAMapping(const MachineFunction &MF) { + assert(static_cast<const MipsSubtarget &>(MF.getSubtarget()).hasMSA() && + "MSA mapping not available on target without MSA."); + return &Mips::ValueMappings[Mips::MSAIdx]; +} + +static const MipsRegisterBankInfo::ValueMapping *getFprbMapping(unsigned Size) { + return Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx] + : &Mips::ValueMappings[Mips::DPRIdx]; +} + +static const unsigned CustomMappingID = 1; + +// Only 64 bit mapping is available in fprb and will be marked as custom, i.e. +// will be split into two 32 bit registers in gprb. 
+static const MipsRegisterBankInfo::ValueMapping * +getGprbOrCustomMapping(unsigned Size, unsigned &MappingID) { + if (Size == 32) + return &Mips::ValueMappings[Mips::GPRIdx]; + + MappingID = CustomMappingID; + return &Mips::ValueMappings[Mips::DPRIdx]; +} + const RegisterBankInfo::InstructionMapping & MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { @@ -377,17 +416,35 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned NumOperands = MI.getNumOperands(); const ValueMapping *OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx]; unsigned MappingID = DefaultMappingID; - const unsigned CustomMappingID = 1; + + // Check if LLT sizes match sizes of available register banks. + for (const MachineOperand &Op : MI.operands()) { + if (Op.isReg()) { + LLT RegTy = MRI.getType(Op.getReg()); + + if (RegTy.isScalar() && + (RegTy.getSizeInBits() != 32 && RegTy.getSizeInBits() != 64)) + return getInvalidInstructionMapping(); + + if (RegTy.isVector() && RegTy.getSizeInBits() != 128) + return getInvalidInstructionMapping(); + } + } + + const LLT Op0Ty = MRI.getType(MI.getOperand(0).getReg()); + unsigned Op0Size = Op0Ty.getSizeInBits(); + InstType InstTy = InstType::Integer; switch (Opc) { case G_TRUNC: - case G_ADD: case G_SUB: case G_MUL: case G_UMULH: case G_ZEXTLOAD: case G_SEXTLOAD: case G_GEP: + case G_INTTOPTR: + case G_PTRTOINT: case G_AND: case G_OR: case G_XOR: @@ -398,66 +455,42 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case G_UDIV: case G_SREM: case G_UREM: + case G_BRINDIRECT: + case G_VASTART: OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx]; break; - case G_LOAD: { - unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - InstType InstTy = InstType::Integer; - if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) { - InstTy = TI.determineInstType(&MI); - } - - if (InstTy == InstType::FloatingPoint || - (Size == 64 && InstTy == InstType::Ambiguous)) { // fprb - OperandsMapping = - getOperandsMapping({Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx], - &Mips::ValueMappings[Mips::GPRIdx]}); + case G_ADD: + OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx]; + if (Op0Size == 128) + OperandsMapping = getMSAMapping(MF); + break; + case G_STORE: + case G_LOAD: + if (Op0Size == 128) { + OperandsMapping = getOperandsMapping( + {getMSAMapping(MF), &Mips::ValueMappings[Mips::GPRIdx]}); break; - } else { // gprb - OperandsMapping = - getOperandsMapping({Size <= 32 ? &Mips::ValueMappings[Mips::GPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx], - &Mips::ValueMappings[Mips::GPRIdx]}); - if (Size == 64) - MappingID = CustomMappingID; } - break; - } - case G_STORE: { - unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - InstType InstTy = InstType::Integer; - if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) { + if (!Op0Ty.isPointer()) InstTy = TI.determineInstType(&MI); - } if (InstTy == InstType::FloatingPoint || - (Size == 64 && InstTy == InstType::Ambiguous)) { // fprb - OperandsMapping = - getOperandsMapping({Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx], - &Mips::ValueMappings[Mips::GPRIdx]}); - break; - } else { // gprb + (Op0Size == 64 && InstTy == InstType::Ambiguous)) + OperandsMapping = getOperandsMapping( + {getFprbMapping(Op0Size), &Mips::ValueMappings[Mips::GPRIdx]}); + else OperandsMapping = - getOperandsMapping({Size <= 32 ? 
&Mips::ValueMappings[Mips::GPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx], + getOperandsMapping({getGprbOrCustomMapping(Op0Size, MappingID), &Mips::ValueMappings[Mips::GPRIdx]}); - if (Size == 64) - MappingID = CustomMappingID; - } + break; - } - case G_PHI: { - unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - InstType InstTy = InstType::Integer; - if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) { + case G_PHI: + if (!Op0Ty.isPointer()) InstTy = TI.determineInstType(&MI); - } // PHI is copylike and should have one regbank in mapping for def register. - if (InstTy == InstType::Integer && Size == 64) { // fprb + if (InstTy == InstType::Integer && Op0Size == 64) { OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::DPRIdx]}); return getInstructionMapping(CustomMappingID, /*Cost=*/1, OperandsMapping, @@ -465,80 +498,63 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } // Use default handling for PHI, i.e. set reg bank of def operand to match // register banks of use operands. - const RegisterBankInfo::InstructionMapping &Mapping = - getInstrMappingImpl(MI); - return Mapping; - } + return getInstrMappingImpl(MI); case G_SELECT: { - unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - InstType InstTy = InstType::Integer; - if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) { + if (!Op0Ty.isPointer()) InstTy = TI.determineInstType(&MI); - } if (InstTy == InstType::FloatingPoint || - (Size == 64 && InstTy == InstType::Ambiguous)) { // fprb - const RegisterBankInfo::ValueMapping *Bank = - Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx]; + (Op0Size == 64 && InstTy == InstType::Ambiguous)) { + const RegisterBankInfo::ValueMapping *Bank = getFprbMapping(Op0Size); OperandsMapping = getOperandsMapping( {Bank, &Mips::ValueMappings[Mips::GPRIdx], Bank, Bank}); break; - } else { // gprb + } else { const RegisterBankInfo::ValueMapping *Bank = - Size <= 32 ? &Mips::ValueMappings[Mips::GPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx]; + getGprbOrCustomMapping(Op0Size, MappingID); OperandsMapping = getOperandsMapping( {Bank, &Mips::ValueMappings[Mips::GPRIdx], Bank, Bank}); - if (Size == 64) - MappingID = CustomMappingID; } break; } - case G_UNMERGE_VALUES: { + case G_IMPLICIT_DEF: + if (!Op0Ty.isPointer()) + InstTy = TI.determineInstType(&MI); + + if (InstTy == InstType::FloatingPoint) + OperandsMapping = getFprbMapping(Op0Size); + else + OperandsMapping = getGprbOrCustomMapping(Op0Size, MappingID); + + break; + case G_UNMERGE_VALUES: OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], &Mips::ValueMappings[Mips::GPRIdx], &Mips::ValueMappings[Mips::DPRIdx]}); MappingID = CustomMappingID; break; - } - case G_MERGE_VALUES: { + case G_MERGE_VALUES: OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::DPRIdx], &Mips::ValueMappings[Mips::GPRIdx], &Mips::ValueMappings[Mips::GPRIdx]}); MappingID = CustomMappingID; break; - } case G_FADD: case G_FSUB: case G_FMUL: case G_FDIV: case G_FABS: - case G_FSQRT:{ - unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - assert((Size == 32 || Size == 64) && "Unsupported floating point size"); - OperandsMapping = Size == 32 ? 
&Mips::ValueMappings[Mips::SPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx]; + case G_FSQRT: + OperandsMapping = getFprbMapping(Op0Size); break; - } - case G_FCONSTANT: { - unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - assert((Size == 32 || Size == 64) && "Unsupported floating point size"); - const RegisterBankInfo::ValueMapping *FPRValueMapping = - Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx]; - OperandsMapping = getOperandsMapping({FPRValueMapping, nullptr}); + case G_FCONSTANT: + OperandsMapping = getOperandsMapping({getFprbMapping(Op0Size), nullptr}); break; - } case G_FCMP: { - unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); - assert((Size == 32 || Size == 64) && "Unsupported floating point size"); - const RegisterBankInfo::ValueMapping *FPRValueMapping = - Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx]; + unsigned Op2Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], nullptr, - FPRValueMapping, FPRValueMapping}); + getFprbMapping(Op2Size), getFprbMapping(Op2Size)}); break; } case G_FPEXT: @@ -550,36 +566,31 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { &Mips::ValueMappings[Mips::DPRIdx]}); break; case G_FPTOSI: { + assert((Op0Size == 32) && "Unsupported integer size"); unsigned SizeFP = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); - assert((MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 32) && - "Unsupported integer size"); - assert((SizeFP == 32 || SizeFP == 64) && "Unsupported floating point size"); - OperandsMapping = getOperandsMapping({ - &Mips::ValueMappings[Mips::GPRIdx], - SizeFP == 32 ? &Mips::ValueMappings[Mips::SPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx], - }); + OperandsMapping = getOperandsMapping( + {&Mips::ValueMappings[Mips::GPRIdx], getFprbMapping(SizeFP)}); break; } - case G_SITOFP: { - unsigned SizeInt = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); - unsigned SizeFP = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - (void)SizeInt; - assert((SizeInt == 32) && "Unsupported integer size"); - assert((SizeFP == 32 || SizeFP == 64) && "Unsupported floating point size"); - OperandsMapping = - getOperandsMapping({SizeFP == 32 ? &Mips::ValueMappings[Mips::SPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx], - &Mips::ValueMappings[Mips::GPRIdx]}); + case G_SITOFP: + assert((MRI.getType(MI.getOperand(1).getReg()).getSizeInBits() == 32) && + "Unsupported integer size"); + OperandsMapping = getOperandsMapping( + {getFprbMapping(Op0Size), &Mips::ValueMappings[Mips::GPRIdx]}); break; - } case G_CONSTANT: case G_FRAME_INDEX: case G_GLOBAL_VALUE: + case G_JUMP_TABLE: case G_BRCOND: OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], nullptr}); break; + case G_BRJT: + OperandsMapping = + getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], nullptr, + &Mips::ValueMappings[Mips::GPRIdx]}); + break; case G_ICMP: OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], nullptr, @@ -609,11 +620,41 @@ public: }; } // end anonymous namespace -/// Here we have to narrowScalar s64 operands to s32, combine away -/// G_MERGE/G_UNMERGE and erase instructions that became dead in the process. -/// We manually assign 32 bit gprb to register operands of all new instructions -/// that got created in the process since they will not end up in RegBankSelect -/// loop. 
Careful not to delete instruction after MI i.e. MI.getIterator()++. +void MipsRegisterBankInfo::setRegBank(MachineInstr &MI, + MachineRegisterInfo &MRI) const { + Register Dest = MI.getOperand(0).getReg(); + switch (MI.getOpcode()) { + case TargetOpcode::G_STORE: + // No def operands, skip this instruction. + break; + case TargetOpcode::G_CONSTANT: + case TargetOpcode::G_LOAD: + case TargetOpcode::G_SELECT: + case TargetOpcode::G_PHI: + case TargetOpcode::G_IMPLICIT_DEF: { + assert(MRI.getType(Dest) == LLT::scalar(32) && "Unexpected operand type."); + MRI.setRegBank(Dest, getRegBank(Mips::GPRBRegBankID)); + break; + } + case TargetOpcode::G_GEP: { + assert(MRI.getType(Dest).isPointer() && "Unexpected operand type."); + MRI.setRegBank(Dest, getRegBank(Mips::GPRBRegBankID)); + break; + } + default: + llvm_unreachable("Unexpected opcode."); + } +} + +static void +combineAwayG_UNMERGE_VALUES(LegalizationArtifactCombiner &ArtCombiner, + MachineInstr &MI) { + SmallVector<MachineInstr *, 2> DeadInstrs; + ArtCombiner.tryCombineMerges(MI, DeadInstrs); + for (MachineInstr *DeadMI : DeadInstrs) + DeadMI->eraseFromParent(); +} + void MipsRegisterBankInfo::applyMappingImpl( const OperandsMapper &OpdMapper) const { MachineInstr &MI = OpdMapper.getMI(); @@ -621,18 +662,19 @@ void MipsRegisterBankInfo::applyMappingImpl( MachineIRBuilder B(MI); MachineFunction *MF = MI.getMF(); MachineRegisterInfo &MRI = OpdMapper.getMRI(); + const LegalizerInfo &LegInfo = *MF->getSubtarget().getLegalizerInfo(); InstManager NewInstrObserver(NewInstrs); GISelObserverWrapper WrapperObserver(&NewInstrObserver); LegalizerHelper Helper(*MF, WrapperObserver, B); - LegalizationArtifactCombiner ArtCombiner( - B, MF->getRegInfo(), *MF->getSubtarget().getLegalizerInfo()); + LegalizationArtifactCombiner ArtCombiner(B, MF->getRegInfo(), LegInfo); switch (MI.getOpcode()) { case TargetOpcode::G_LOAD: case TargetOpcode::G_STORE: case TargetOpcode::G_PHI: - case TargetOpcode::G_SELECT: { + case TargetOpcode::G_SELECT: + case TargetOpcode::G_IMPLICIT_DEF: { Helper.narrowScalar(MI, 0, LLT::scalar(32)); // Handle new instructions. while (!NewInstrs.empty()) { @@ -640,35 +682,21 @@ void MipsRegisterBankInfo::applyMappingImpl( // This is new G_UNMERGE that was created during narrowScalar and will // not be considered for regbank selection. RegBankSelect for mips // visits/makes corresponding G_MERGE first. Combine them here. - if (NewMI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) { - SmallVector<MachineInstr *, 2> DeadInstrs; - ArtCombiner.tryCombineMerges(*NewMI, DeadInstrs); - for (MachineInstr *DeadMI : DeadInstrs) - DeadMI->eraseFromParent(); - } + if (NewMI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) + combineAwayG_UNMERGE_VALUES(ArtCombiner, *NewMI); // This G_MERGE will be combined away when its corresponding G_UNMERGE // gets regBankSelected. else if (NewMI->getOpcode() == TargetOpcode::G_MERGE_VALUES) continue; else - // Manually set register banks for all register operands to 32 bit gprb. - for (auto Op : NewMI->operands()) { - if (Op.isReg()) { - assert(MRI.getType(Op.getReg()).getSizeInBits() == 32 && - "Only 32 bit gprb is handled here.\n"); - MRI.setRegBank(Op.getReg(), getRegBank(Mips::GPRBRegBankID)); - } - } + // Manually set register banks for def operands to 32 bit gprb. 
+ setRegBank(*NewMI, MRI); } return; } - case TargetOpcode::G_UNMERGE_VALUES: { - SmallVector<MachineInstr *, 2> DeadInstrs; - ArtCombiner.tryCombineMerges(MI, DeadInstrs); - for (MachineInstr *DeadMI : DeadInstrs) - DeadMI->eraseFromParent(); + case TargetOpcode::G_UNMERGE_VALUES: + combineAwayG_UNMERGE_VALUES(ArtCombiner, MI); return; - } default: break; } diff --git a/lib/Target/Mips/MipsRegisterBankInfo.h b/lib/Target/Mips/MipsRegisterBankInfo.h index 176813c031ed..fa0f1c7bc941 100644 --- a/lib/Target/Mips/MipsRegisterBankInfo.h +++ b/lib/Target/Mips/MipsRegisterBankInfo.h @@ -38,8 +38,17 @@ public: const InstructionMapping & getInstrMapping(const MachineInstr &MI) const override; + /// Here we have to narrowScalar s64 operands to s32, combine away G_MERGE or + /// G_UNMERGE and erase instructions that became dead in the process. We + /// manually assign bank to def operand of all new instructions that were + /// created in the process since they will not end up in RegBankSelect loop. void applyMappingImpl(const OperandsMapper &OpdMapper) const override; + /// RegBankSelect determined that s64 operand is better to be split into two + /// s32 operands in gprb. Here we manually set register banks of def operands + /// of newly created instructions since they will not get regbankselected. + void setRegBank(MachineInstr &MI, MachineRegisterInfo &MRI) const; + private: /// Some instructions are used with both floating point and integer operands. /// We assign InstType to such instructions as it helps us to avoid cross bank diff --git a/lib/Target/Mips/MipsRegisterBanks.td b/lib/Target/Mips/MipsRegisterBanks.td index 14a0181f8f11..7d11475884ce 100644 --- a/lib/Target/Mips/MipsRegisterBanks.td +++ b/lib/Target/Mips/MipsRegisterBanks.td @@ -11,4 +11,4 @@ def GPRBRegBank : RegisterBank<"GPRB", [GPR32]>; -def FPRBRegBank : RegisterBank<"FPRB", [FGR64, AFGR64]>; +def FPRBRegBank : RegisterBank<"FPRB", [FGR64, AFGR64, MSA128D]>; diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp index 4c6cc1ef771c..166ddea0431f 100644 --- a/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -171,8 +171,8 @@ void ExpandPseudo::expandLoadCCond(MachineBasicBlock &MBB, Iter I) { assert(I->getOperand(0).isReg() && I->getOperand(1).isFI()); const TargetRegisterClass *RC = RegInfo.intRegClass(4); - unsigned VR = MRI.createVirtualRegister(RC); - unsigned Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex(); + Register VR = MRI.createVirtualRegister(RC); + Register Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex(); TII.loadRegFromStack(MBB, I, VR, FI, RC, &RegInfo, 0); BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), Dst) @@ -186,8 +186,8 @@ void ExpandPseudo::expandStoreCCond(MachineBasicBlock &MBB, Iter I) { assert(I->getOperand(0).isReg() && I->getOperand(1).isFI()); const TargetRegisterClass *RC = RegInfo.intRegClass(4); - unsigned VR = MRI.createVirtualRegister(RC); - unsigned Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex(); + Register VR = MRI.createVirtualRegister(RC); + Register Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex(); BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), VR) .addReg(Src, getKillRegState(I->getOperand(0).isKill())); @@ -204,11 +204,11 @@ void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I, assert(I->getOperand(0).isReg() && I->getOperand(1).isFI()); const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize); - 
unsigned VR0 = MRI.createVirtualRegister(RC); - unsigned VR1 = MRI.createVirtualRegister(RC); - unsigned Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex(); - unsigned Lo = RegInfo.getSubReg(Dst, Mips::sub_lo); - unsigned Hi = RegInfo.getSubReg(Dst, Mips::sub_hi); + Register VR0 = MRI.createVirtualRegister(RC); + Register VR1 = MRI.createVirtualRegister(RC); + Register Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex(); + Register Lo = RegInfo.getSubReg(Dst, Mips::sub_lo); + Register Hi = RegInfo.getSubReg(Dst, Mips::sub_hi); DebugLoc DL = I->getDebugLoc(); const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY); @@ -229,9 +229,9 @@ void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I, assert(I->getOperand(0).isReg() && I->getOperand(1).isFI()); const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize); - unsigned VR0 = MRI.createVirtualRegister(RC); - unsigned VR1 = MRI.createVirtualRegister(RC); - unsigned Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex(); + Register VR0 = MRI.createVirtualRegister(RC); + Register VR1 = MRI.createVirtualRegister(RC); + Register Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex(); unsigned SrcKill = getKillRegState(I->getOperand(0).isKill()); DebugLoc DL = I->getDebugLoc(); @@ -242,7 +242,7 @@ void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I, } bool ExpandPseudo::expandCopy(MachineBasicBlock &MBB, Iter I) { - unsigned Src = I->getOperand(1).getReg(); + Register Src = I->getOperand(1).getReg(); std::pair<unsigned, unsigned> Opcodes = getMFHiLoOpc(Src); if (!Opcodes.first) @@ -262,11 +262,11 @@ bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I, const TargetRegisterClass *DstRC = RegInfo.getMinimalPhysRegClass(Dst); unsigned VRegSize = RegInfo.getRegSizeInBits(*DstRC) / 16; const TargetRegisterClass *RC = RegInfo.intRegClass(VRegSize); - unsigned VR0 = MRI.createVirtualRegister(RC); - unsigned VR1 = MRI.createVirtualRegister(RC); + Register VR0 = MRI.createVirtualRegister(RC); + Register VR1 = MRI.createVirtualRegister(RC); unsigned SrcKill = getKillRegState(I->getOperand(1).isKill()); - unsigned DstLo = RegInfo.getSubReg(Dst, Mips::sub_lo); - unsigned DstHi = RegInfo.getSubReg(Dst, Mips::sub_hi); + Register DstLo = RegInfo.getSubReg(Dst, Mips::sub_lo); + Register DstHi = RegInfo.getSubReg(Dst, Mips::sub_hi); DebugLoc DL = I->getDebugLoc(); BuildMI(MBB, I, DL, TII.get(MFLoOpc), VR0).addReg(Src); @@ -304,9 +304,9 @@ bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB, // stack is used. if (I->getNumOperands() == 4 && I->getOperand(3).isReg() && I->getOperand(3).getReg() == Mips::SP) { - unsigned DstReg = I->getOperand(0).getReg(); - unsigned LoReg = I->getOperand(1).getReg(); - unsigned HiReg = I->getOperand(2).getReg(); + Register DstReg = I->getOperand(0).getReg(); + Register LoReg = I->getOperand(1).getReg(); + Register HiReg = I->getOperand(2).getReg(); // It should be impossible to have FGR64 on MIPS-II or MIPS32r1 (which are // the cases where mthc1 is not available). 
64-bit architectures and @@ -346,7 +346,7 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB, const MachineOperand &Op2 = I->getOperand(2); if ((Op1.isReg() && Op1.isUndef()) || (Op2.isReg() && Op2.isUndef())) { - unsigned DstReg = I->getOperand(0).getReg(); + Register DstReg = I->getOperand(0).getReg(); BuildMI(MBB, I, I->getDebugLoc(), TII.get(Mips::IMPLICIT_DEF), DstReg); return true; } @@ -369,8 +369,8 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB, // stack is used. if (I->getNumOperands() == 4 && I->getOperand(3).isReg() && I->getOperand(3).getReg() == Mips::SP) { - unsigned DstReg = I->getOperand(0).getReg(); - unsigned SrcReg = Op1.getReg(); + Register DstReg = I->getOperand(0).getReg(); + Register SrcReg = Op1.getReg(); unsigned N = Op2.getImm(); int64_t Offset = 4 * (Subtarget.isLittle() ? N : (1 - N)); @@ -538,7 +538,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, if (RegInfo.needsStackRealignment(MF)) { // addiu $Reg, $zero, -MaxAlignment // andi $sp, $sp, $Reg - unsigned VR = MF.getRegInfo().createVirtualRegister(RC); + Register VR = MF.getRegInfo().createVirtualRegister(RC); assert(isInt<16>(MFI.getMaxAlignment()) && "Function's alignment size requirement is not supported."); int MaxAlign = -(int)MFI.getMaxAlignment(); @@ -865,12 +865,15 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF, const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); MipsABIInfo ABI = STI.getABI(); + unsigned RA = ABI.IsN64() ? Mips::RA_64 : Mips::RA; unsigned FP = ABI.GetFramePtr(); unsigned BP = ABI.IsN64() ? Mips::S7_64 : Mips::S7; - // Mark $fp as used if function has dedicated frame pointer. - if (hasFP(MF)) + // Mark $ra and $fp as used if function has dedicated frame pointer. + if (hasFP(MF)) { + setAliasRegs(MF, SavedRegs, RA); setAliasRegs(MF, SavedRegs, FP); + } // Mark $s7 as used if function has dedicated base pointer. if (hasBP(MF)) setAliasRegs(MF, SavedRegs, BP); diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 703f99f37dd1..c8313240a678 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -124,6 +124,33 @@ bool MipsSEDAGToDAGISel::replaceUsesWithZeroReg(MachineRegisterInfo *MRI, return true; } +void MipsSEDAGToDAGISel::emitMCountABI(MachineInstr &MI, MachineBasicBlock &MBB, + MachineFunction &MF) { + MachineInstrBuilder MIB(MF, &MI); + if (!Subtarget->isABI_O32()) { // N32, N64 + // Save current return address. + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Mips::OR64)) + .addDef(Mips::AT_64) + .addUse(Mips::RA_64, RegState::Undef) + .addUse(Mips::ZERO_64); + // Stops instruction above from being removed later on. + MIB.addUse(Mips::AT_64, RegState::Implicit); + } else { // O32 + // Save current return address. + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Mips::OR)) + .addDef(Mips::AT) + .addUse(Mips::RA, RegState::Undef) + .addUse(Mips::ZERO); + // _mcount pops 2 words from stack. + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Mips::ADDiu)) + .addDef(Mips::SP) + .addUse(Mips::SP) + .addImm(-8); + // Stops first instruction above from being removed later on. 
+ MIB.addUse(Mips::AT, RegState::Implicit); + } +} + void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) { MF.getInfo<MipsFunctionInfo>()->initGlobalBaseReg(); @@ -150,6 +177,24 @@ void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) { if (Subtarget->isABI_FPXX() && !Subtarget->hasMTHC1()) MI.addOperand(MachineOperand::CreateReg(Mips::SP, false, true)); break; + case Mips::JAL: + case Mips::JAL_MM: + if (MI.getOperand(0).isGlobal() && + MI.getOperand(0).getGlobal()->getGlobalIdentifier() == "_mcount") + emitMCountABI(MI, MBB, MF); + break; + case Mips::JALRPseudo: + case Mips::JALR64Pseudo: + case Mips::JALR16_MM: + if (MI.getOperand(2).isMCSymbol() && + MI.getOperand(2).getMCSymbol()->getName() == "_mcount") + emitMCountABI(MI, MBB, MF); + break; + case Mips::JALR: + if (MI.getOperand(3).isMCSymbol() && + MI.getOperand(3).getMCSymbol()->getName() == "_mcount") + emitMCountABI(MI, MBB, MF); + break; default: replaceUsesWithZeroReg(MRI, MI); } @@ -247,7 +292,8 @@ bool MipsSEDAGToDAGISel::selectAddrFrameIndexOffset( Base = Addr.getOperand(0); // If base is a FI, additional offset calculation is done in // eliminateFrameIndex, otherwise we need to check the alignment - if (OffsetToAlignment(CN->getZExtValue(), 1ull << ShiftAmount) != 0) + const Align Alignment(1ULL << ShiftAmount); + if (!isAligned(Alignment, CN->getZExtValue())) return false; } @@ -719,7 +765,7 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) { } case ISD::ConstantFP: { - ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(Node); + auto *CN = cast<ConstantFPSDNode>(Node); if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) { if (Subtarget->isGP64bit()) { SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, @@ -743,7 +789,7 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) { } case ISD::Constant: { - const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Node); + auto *CN = cast<ConstantSDNode>(Node); int64_t Imm = CN->getSExtValue(); unsigned Size = CN->getValueSizeInBits(0); @@ -969,7 +1015,7 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) { break; } - SDNode *Res; + SDNode *Res = nullptr; // If we have a signed 10 bit integer, we can splat it directly. // diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h index ce594e1fb4fa..39f665be571e 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.h +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h @@ -120,7 +120,7 @@ private: /// power of 2. bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const override; /// Select constant vector splats whose value is a run of set bits - /// ending at the most significant bit + /// ending at the most significant bit. bool selectVSplatMaskL(SDValue N, SDValue &Imm) const override; /// Select constant vector splats whose value is a run of set bits /// starting at bit zero. @@ -128,6 +128,10 @@ private: bool trySelect(SDNode *Node) override; + // Emits proper ABI for _mcount profiling calls. 
+ void emitMCountABI(MachineInstr &MI, MachineBasicBlock &MBB, + MachineFunction &MF); + void processFunctionAfterISel(MachineFunction &MF) override; bool SelectInlineAsmMemoryOperand(const SDValue &Op, diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index edf57a3840d1..5bd234f955ba 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -71,8 +71,8 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM, if (Subtarget.hasDSP() || Subtarget.hasMSA()) { // Expand all truncating stores and extending loads. - for (MVT VT0 : MVT::vector_valuetypes()) { - for (MVT VT1 : MVT::vector_valuetypes()) { + for (MVT VT0 : MVT::fixedlen_vector_valuetypes()) { + for (MVT VT1 : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(VT0, VT1, Expand); setLoadExtAction(ISD::SEXTLOAD, VT0, VT1, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT0, VT1, Expand); @@ -327,6 +327,7 @@ addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) { setOperationAction(ISD::EXTRACT_VECTOR_ELT, Ty, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, Ty, Legal); setOperationAction(ISD::BUILD_VECTOR, Ty, Custom); + setOperationAction(ISD::UNDEF, Ty, Legal); setOperationAction(ISD::ADD, Ty, Legal); setOperationAction(ISD::AND, Ty, Legal); @@ -2595,7 +2596,8 @@ static SDValue lowerVECTOR_SHUFFLE_SHF(SDValue Op, EVT ResTy, SDLoc DL(Op); return DAG.getNode(MipsISD::SHF, DL, ResTy, - DAG.getConstant(Imm, DL, MVT::i32), Op->getOperand(0)); + DAG.getTargetConstant(Imm, DL, MVT::i32), + Op->getOperand(0)); } /// Determine whether a range fits a regular pattern of values. @@ -3062,13 +3064,13 @@ MipsSETargetLowering::emitBPOSGE32(MachineInstr &MI, BuildMI(BB, DL, TII->get(Mips::BPOSGE32C_MMR3)).addMBB(TBB); // Fill $FBB. - unsigned VR2 = RegInfo.createVirtualRegister(RC); + Register VR2 = RegInfo.createVirtualRegister(RC); BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::ADDiu), VR2) .addReg(Mips::ZERO).addImm(0); BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::B)).addMBB(Sink); // Fill $TBB. - unsigned VR1 = RegInfo.createVirtualRegister(RC); + Register VR1 = RegInfo.createVirtualRegister(RC); BuildMI(*TBB, TBB->end(), DL, TII->get(Mips::ADDiu), VR1) .addReg(Mips::ZERO).addImm(1); @@ -3131,13 +3133,13 @@ MachineBasicBlock *MipsSETargetLowering::emitMSACBranchPseudo( .addMBB(TBB); // Fill $FBB. - unsigned RD1 = RegInfo.createVirtualRegister(RC); + Register RD1 = RegInfo.createVirtualRegister(RC); BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::ADDiu), RD1) .addReg(Mips::ZERO).addImm(0); BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::B)).addMBB(Sink); // Fill $TBB. - unsigned RD2 = RegInfo.createVirtualRegister(RC); + Register RD2 = RegInfo.createVirtualRegister(RC); BuildMI(*TBB, TBB->end(), DL, TII->get(Mips::ADDiu), RD2) .addReg(Mips::ZERO).addImm(1); @@ -3169,8 +3171,8 @@ MipsSETargetLowering::emitCOPY_FW(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Fd = MI.getOperand(0).getReg(); - unsigned Ws = MI.getOperand(1).getReg(); + Register Fd = MI.getOperand(0).getReg(); + Register Ws = MI.getOperand(1).getReg(); unsigned Lane = MI.getOperand(2).getImm(); if (Lane == 0) { @@ -3185,9 +3187,9 @@ MipsSETargetLowering::emitCOPY_FW(MachineInstr &MI, BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo); } else { - unsigned Wt = RegInfo.createVirtualRegister( - Subtarget.useOddSPReg() ? 
&Mips::MSA128WRegClass : - &Mips::MSA128WEvensRegClass); + Register Wt = RegInfo.createVirtualRegister( + Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass + : &Mips::MSA128WEvensRegClass); BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wt).addReg(Ws).addImm(Lane); BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo); @@ -3214,15 +3216,15 @@ MipsSETargetLowering::emitCOPY_FD(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); - unsigned Fd = MI.getOperand(0).getReg(); - unsigned Ws = MI.getOperand(1).getReg(); + Register Fd = MI.getOperand(0).getReg(); + Register Ws = MI.getOperand(1).getReg(); unsigned Lane = MI.getOperand(2).getImm() * 2; DebugLoc DL = MI.getDebugLoc(); if (Lane == 0) BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_64); else { - unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass); + Register Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass); BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_D), Wt).addReg(Ws).addImm(1); BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_64); @@ -3244,13 +3246,13 @@ MipsSETargetLowering::emitINSERT_FW(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Wd = MI.getOperand(0).getReg(); - unsigned Wd_in = MI.getOperand(1).getReg(); + Register Wd = MI.getOperand(0).getReg(); + Register Wd_in = MI.getOperand(1).getReg(); unsigned Lane = MI.getOperand(2).getImm(); - unsigned Fs = MI.getOperand(3).getReg(); - unsigned Wt = RegInfo.createVirtualRegister( - Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass : - &Mips::MSA128WEvensRegClass); + Register Fs = MI.getOperand(3).getReg(); + Register Wt = RegInfo.createVirtualRegister( + Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass + : &Mips::MSA128WEvensRegClass); BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt) .addImm(0) @@ -3280,11 +3282,11 @@ MipsSETargetLowering::emitINSERT_FD(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Wd = MI.getOperand(0).getReg(); - unsigned Wd_in = MI.getOperand(1).getReg(); + Register Wd = MI.getOperand(0).getReg(); + Register Wd_in = MI.getOperand(1).getReg(); unsigned Lane = MI.getOperand(2).getImm(); - unsigned Fs = MI.getOperand(3).getReg(); - unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass); + Register Fs = MI.getOperand(3).getReg(); + Register Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass); BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt) .addImm(0) @@ -3326,10 +3328,10 @@ MachineBasicBlock *MipsSETargetLowering::emitINSERT_DF_VIDX( const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Wd = MI.getOperand(0).getReg(); - unsigned SrcVecReg = MI.getOperand(1).getReg(); - unsigned LaneReg = MI.getOperand(2).getReg(); - unsigned SrcValReg = MI.getOperand(3).getReg(); + Register Wd = MI.getOperand(0).getReg(); + Register SrcVecReg = MI.getOperand(1).getReg(); + Register LaneReg = MI.getOperand(2).getReg(); + Register SrcValReg = MI.getOperand(3).getReg(); const TargetRegisterClass *VecRC = nullptr; // FIXME: This should be true for N32 too. 
@@ -3370,7 +3372,7 @@ MachineBasicBlock *MipsSETargetLowering::emitINSERT_DF_VIDX( } if (IsFP) { - unsigned Wt = RegInfo.createVirtualRegister(VecRC); + Register Wt = RegInfo.createVirtualRegister(VecRC); BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt) .addImm(0) .addReg(SrcValReg) @@ -3380,7 +3382,7 @@ MachineBasicBlock *MipsSETargetLowering::emitINSERT_DF_VIDX( // Convert the lane index into a byte index if (EltSizeInBytes != 1) { - unsigned LaneTmp1 = RegInfo.createVirtualRegister(GPRRC); + Register LaneTmp1 = RegInfo.createVirtualRegister(GPRRC); BuildMI(*BB, MI, DL, TII->get(ShiftOp), LaneTmp1) .addReg(LaneReg) .addImm(EltLog2Size); @@ -3388,13 +3390,13 @@ MachineBasicBlock *MipsSETargetLowering::emitINSERT_DF_VIDX( } // Rotate bytes around so that the desired lane is element zero - unsigned WdTmp1 = RegInfo.createVirtualRegister(VecRC); + Register WdTmp1 = RegInfo.createVirtualRegister(VecRC); BuildMI(*BB, MI, DL, TII->get(Mips::SLD_B), WdTmp1) .addReg(SrcVecReg) .addReg(SrcVecReg) .addReg(LaneReg, 0, SubRegIdx); - unsigned WdTmp2 = RegInfo.createVirtualRegister(VecRC); + Register WdTmp2 = RegInfo.createVirtualRegister(VecRC); if (IsFP) { // Use insve.df to insert to element zero BuildMI(*BB, MI, DL, TII->get(InsveOp), WdTmp2) @@ -3413,7 +3415,7 @@ MachineBasicBlock *MipsSETargetLowering::emitINSERT_DF_VIDX( // Rotate elements the rest of the way for a full rotation. // sld.df interprets $rt modulo the number of columns so we only need to negate // the lane index to do this. - unsigned LaneTmp2 = RegInfo.createVirtualRegister(GPRRC); + Register LaneTmp2 = RegInfo.createVirtualRegister(GPRRC); BuildMI(*BB, MI, DL, TII->get(Subtarget.isABI_N64() ? Mips::DSUB : Mips::SUB), LaneTmp2) .addReg(Subtarget.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO) @@ -3440,12 +3442,12 @@ MipsSETargetLowering::emitFILL_FW(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Wd = MI.getOperand(0).getReg(); - unsigned Fs = MI.getOperand(1).getReg(); - unsigned Wt1 = RegInfo.createVirtualRegister( + Register Wd = MI.getOperand(0).getReg(); + Register Fs = MI.getOperand(1).getReg(); + Register Wt1 = RegInfo.createVirtualRegister( Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass : &Mips::MSA128WEvensRegClass); - unsigned Wt2 = RegInfo.createVirtualRegister( + Register Wt2 = RegInfo.createVirtualRegister( Subtarget.useOddSPReg() ?
&Mips::MSA128WRegClass : &Mips::MSA128WEvensRegClass); @@ -3475,10 +3477,10 @@ MipsSETargetLowering::emitFILL_FD(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Wd = MI.getOperand(0).getReg(); - unsigned Fs = MI.getOperand(1).getReg(); - unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass); - unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass); + Register Wd = MI.getOperand(0).getReg(); + Register Fs = MI.getOperand(1).getReg(); + Register Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass); + Register Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass); BuildMI(*BB, MI, DL, TII->get(Mips::IMPLICIT_DEF), Wt1); BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_SUBREG), Wt2) @@ -3509,8 +3511,8 @@ MipsSETargetLowering::emitST_F16_PSEUDO(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Ws = MI.getOperand(0).getReg(); - unsigned Rt = MI.getOperand(1).getReg(); + Register Ws = MI.getOperand(0).getReg(); + Register Rt = MI.getOperand(1).getReg(); const MachineMemOperand &MMO = **MI.memoperands_begin(); unsigned Imm = MMO.getOffset(); @@ -3522,11 +3524,11 @@ MipsSETargetLowering::emitST_F16_PSEUDO(MachineInstr &MI, : (Subtarget.isABI_O32() ? &Mips::GPR32RegClass : &Mips::GPR64RegClass); const bool UsingMips32 = RC == &Mips::GPR32RegClass; - unsigned Rs = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); + Register Rs = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); BuildMI(*BB, MI, DL, TII->get(Mips::COPY_U_H), Rs).addReg(Ws).addImm(0); if(!UsingMips32) { - unsigned Tmp = RegInfo.createVirtualRegister(&Mips::GPR64RegClass); + Register Tmp = RegInfo.createVirtualRegister(&Mips::GPR64RegClass); BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Tmp) .addImm(0) .addReg(Rs) @@ -3564,7 +3566,7 @@ MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Wd = MI.getOperand(0).getReg(); + Register Wd = MI.getOperand(0).getReg(); // Caution: A load via the GOT can expand to a GPR32 operand, a load via // spill and reload can expand as a GPR64 operand. Examine the @@ -3575,7 +3577,7 @@ MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI, : &Mips::GPR64RegClass); const bool UsingMips32 = RC == &Mips::GPR32RegClass; - unsigned Rt = RegInfo.createVirtualRegister(RC); + Register Rt = RegInfo.createVirtualRegister(RC); MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(UsingMips32 ? 
Mips::LH : Mips::LH64), Rt); @@ -3583,7 +3585,7 @@ MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI, MIB.add(MI.getOperand(i)); if(!UsingMips32) { - unsigned Tmp = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); + Register Tmp = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Tmp).addReg(Rt, 0, Mips::sub_32); Rt = Tmp; } @@ -3658,11 +3660,11 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Wd = MI.getOperand(0).getReg(); - unsigned Fs = MI.getOperand(1).getReg(); + Register Wd = MI.getOperand(0).getReg(); + Register Fs = MI.getOperand(1).getReg(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); - unsigned Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); + Register Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); const TargetRegisterClass *GPRRC = IsFGR64onMips64 ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; unsigned MFC1Opc = IsFGR64onMips64 @@ -3671,16 +3673,16 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, unsigned FILLOpc = IsFGR64onMips64 ? Mips::FILL_D : Mips::FILL_W; // Perform the register class copy as mentioned above. - unsigned Rtemp = RegInfo.createVirtualRegister(GPRRC); + Register Rtemp = RegInfo.createVirtualRegister(GPRRC); BuildMI(*BB, MI, DL, TII->get(MFC1Opc), Rtemp).addReg(Fs); BuildMI(*BB, MI, DL, TII->get(FILLOpc), Wtemp).addReg(Rtemp); unsigned WPHI = Wtemp; if (IsFGR64onMips32) { - unsigned Rtemp2 = RegInfo.createVirtualRegister(GPRRC); + Register Rtemp2 = RegInfo.createVirtualRegister(GPRRC); BuildMI(*BB, MI, DL, TII->get(Mips::MFHC1_D64), Rtemp2).addReg(Fs); - unsigned Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); - unsigned Wtemp3 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); + Register Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); + Register Wtemp3 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_W), Wtemp2) .addReg(Wtemp) .addReg(Rtemp2) @@ -3693,7 +3695,7 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, } if (IsFGR64) { - unsigned Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); + Register Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); BuildMI(*BB, MI, DL, TII->get(Mips::FEXDO_W), Wtemp2) .addReg(WPHI) .addReg(WPHI); @@ -3817,8 +3819,8 @@ MipsSETargetLowering::emitFEXP2_W_1(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); const TargetRegisterClass *RC = &Mips::MSA128WRegClass; - unsigned Ws1 = RegInfo.createVirtualRegister(RC); - unsigned Ws2 = RegInfo.createVirtualRegister(RC); + Register Ws1 = RegInfo.createVirtualRegister(RC); + Register Ws2 = RegInfo.createVirtualRegister(RC); DebugLoc DL = MI.getDebugLoc(); // Splat 1.0 into a vector @@ -3846,8 +3848,8 @@ MipsSETargetLowering::emitFEXP2_D_1(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); const TargetRegisterClass *RC = &Mips::MSA128DRegClass; - unsigned Ws1 = RegInfo.createVirtualRegister(RC); - unsigned Ws2 = RegInfo.createVirtualRegister(RC); + Register Ws1 = RegInfo.createVirtualRegister(RC); + Register Ws2 = RegInfo.createVirtualRegister(RC); DebugLoc DL = MI.getDebugLoc(); // Splat 1.0 into a vector diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp 
b/lib/Target/Mips/MipsSEInstrInfo.cpp index 4e49f5e7d9d1..2126a1bda493 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -628,7 +628,7 @@ unsigned MipsSEInstrInfo::loadImmediate(int64_t Imm, MachineBasicBlock &MBB, // The first instruction can be a LUi, which is different from other // instructions (ADDiu, ORI and SLL) in that it does not have a register // operand. - unsigned Reg = RegInfo.createVirtualRegister(RC); + Register Reg = RegInfo.createVirtualRegister(RC); if (Inst->Opc == LUi) BuildMI(MBB, II, DL, get(LUi), Reg).addImm(SignExtend64<16>(Inst->ImmOpnd)); @@ -734,9 +734,9 @@ void MipsSEInstrInfo::expandPseudoMTLoHi(MachineBasicBlock &MBB, // Add lo/hi registers if the mtlo/hi instructions created have explicit // def registers. if (HasExplicitDef) { - unsigned DstReg = I->getOperand(0).getReg(); - unsigned DstLo = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo); - unsigned DstHi = getRegisterInfo().getSubReg(DstReg, Mips::sub_hi); + Register DstReg = I->getOperand(0).getReg(); + Register DstLo = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo); + Register DstHi = getRegisterInfo().getSubReg(DstReg, Mips::sub_hi); LoInst.addReg(DstLo, RegState::Define); HiInst.addReg(DstHi, RegState::Define); } @@ -773,14 +773,14 @@ void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, bool isMicroMips, bool FP64) const { - unsigned DstReg = I->getOperand(0).getReg(); - unsigned SrcReg = I->getOperand(1).getReg(); + Register DstReg = I->getOperand(0).getReg(); + Register SrcReg = I->getOperand(1).getReg(); unsigned N = I->getOperand(2).getImm(); DebugLoc dl = I->getDebugLoc(); assert(N < 2 && "Invalid immediate"); unsigned SubIdx = N ? Mips::sub_hi : Mips::sub_lo; - unsigned SubReg = getRegisterInfo().getSubReg(SrcReg, SubIdx); + Register SubReg = getRegisterInfo().getSubReg(SrcReg, SubIdx); // FPXX on MIPS-II or MIPS32r1 should have been handled with a spill/reload // in MipsSEFrameLowering.cpp. @@ -815,7 +815,7 @@ void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB, void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, bool isMicroMips, bool FP64) const { - unsigned DstReg = I->getOperand(0).getReg(); + Register DstReg = I->getOperand(0).getReg(); unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg(); const MCInstrDesc& Mtc1Tdd = get(Mips::MTC1); DebugLoc dl = I->getDebugLoc(); @@ -883,8 +883,8 @@ void MipsSEInstrInfo::expandEhReturn(MachineBasicBlock &MBB, unsigned RA = Subtarget.isGP64bit() ? Mips::RA_64 : Mips::RA; unsigned T9 = Subtarget.isGP64bit() ? Mips::T9_64 : Mips::T9; unsigned ZERO = Subtarget.isGP64bit() ? Mips::ZERO_64 : Mips::ZERO; - unsigned OffsetReg = I->getOperand(0).getReg(); - unsigned TargetReg = I->getOperand(1).getReg(); + Register OffsetReg = I->getOperand(0).getReg(); + Register TargetReg = I->getOperand(1).getReg(); // addu $ra, $v0, $zero // addu $sp, $sp, $v1 diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp index f4b164d5c0ab..a48088c28919 100644 --- a/lib/Target/Mips/MipsSERegisterInfo.cpp +++ b/lib/Target/Mips/MipsSERegisterInfo.cpp @@ -212,11 +212,9 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II, // element size), otherwise it is a 16-bit signed immediate. 
unsigned OffsetBitSize = getLoadStoreOffsetSizeInBits(MI.getOpcode(), MI.getOperand(OpNo - 1)); - unsigned OffsetAlign = getLoadStoreOffsetAlign(MI.getOpcode()); - + const Align OffsetAlign(getLoadStoreOffsetAlign(MI.getOpcode())); if (OffsetBitSize < 16 && isInt<16>(Offset) && - (!isIntN(OffsetBitSize, Offset) || - OffsetToAlignment(Offset, OffsetAlign) != 0)) { + (!isIntN(OffsetBitSize, Offset) || !isAligned(OffsetAlign, Offset))) { // If we have an offset that needs to fit into a signed n-bit immediate // (where n < 16) and doesn't, but does fit into 16-bits then use an ADDiu MachineBasicBlock &MBB = *MI.getParent(); @@ -224,7 +222,7 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II, const TargetRegisterClass *PtrRC = ABI.ArePtrs64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); - unsigned Reg = RegInfo.createVirtualRegister(PtrRC); + Register Reg = RegInfo.createVirtualRegister(PtrRC); const MipsSEInstrInfo &TII = *static_cast<const MipsSEInstrInfo *>( MBB.getParent()->getSubtarget().getInstrInfo()); diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index d021b3d021b1..b9245c9fc0eb 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -69,7 +69,7 @@ void MipsSubtarget::anchor() {} MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, bool little, const MipsTargetMachine &TM, - unsigned StackAlignOverride) + MaybeAlign StackAlignOverride) : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(MipsDefault), IsLittle(little), IsSoftFloat(false), IsSingleFloat(false), IsFPXX(false), NoABICalls(false), Abs2008(false), IsFP64bit(false), UseOddSPReg(true), @@ -81,10 +81,9 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasSym32(false), HasEVA(false), DisableMadd4(false), HasMT(false), HasCRC(false), HasVirt(false), HasGINV(false), UseIndirectJumpsHazard(false), - StackAlignOverride(StackAlignOverride), - TM(TM), TargetTriple(TT), TSInfo(), - InstrInfo( - MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))), + StackAlignOverride(StackAlignOverride), TM(TM), TargetTriple(TT), + TSInfo(), InstrInfo(MipsInstrInfo::create( + initializeSubtargetDependencies(CPU, FS, TM))), FrameLowering(MipsFrameLowering::create(*this)), TLInfo(MipsTargetLowering::create(TM, *this)) { @@ -248,12 +247,12 @@ MipsSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS, InMips16HardFloat = true; if (StackAlignOverride) - stackAlignment = StackAlignOverride; + stackAlignment = *StackAlignOverride; else if (isABI_N32() || isABI_N64()) - stackAlignment = 16; + stackAlignment = Align(16); else { assert(isABI_O32() && "Unknown ABI for stack alignment!"); - stackAlignment = 8; + stackAlignment = Align(8); } return *this; @@ -286,6 +285,6 @@ const RegisterBankInfo *MipsSubtarget::getRegBankInfo() const { return RegBankInfo.get(); } -const InstructionSelector *MipsSubtarget::getInstructionSelector() const { +InstructionSelector *MipsSubtarget::getInstructionSelector() const { return InstSelector.get(); } diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index aa1200579fc8..0a8c2ef8ae5c 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -189,12 +189,15 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // Disable use of the `jal` instruction. 
bool UseLongCalls = false; + // Assume 32-bit GOT. + bool UseXGOT = false; + /// The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. - unsigned stackAlignment; + Align stackAlignment; /// The overridden stack alignment. - unsigned StackAlignOverride; + MaybeAlign StackAlignOverride; InstrItineraryData InstrItins; @@ -227,7 +230,7 @@ public: /// This constructor initializes the data members to match that /// of the specified triple. MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, bool little, - const MipsTargetMachine &TM, unsigned StackAlignOverride); + const MipsTargetMachine &TM, MaybeAlign StackAlignOverride); /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. @@ -323,6 +326,8 @@ public: bool useLongCalls() const { return UseLongCalls; } + bool useXGOT() const { return UseXGOT; } + bool enableLongBranchPass() const { return hasStandardEncoding() || inMicroMipsMode() || allowMixed16_32(); } @@ -344,7 +349,7 @@ public: // really use them if in addition we are in mips16 mode static bool useConstantIslands(); - unsigned getStackAlignment() const { return stackAlignment; } + Align getStackAlignment() const { return stackAlignment; } // Grab relocation model Reloc::Model getRelocationModel() const; @@ -391,7 +396,7 @@ public: const CallLowering *getCallLowering() const override; const LegalizerInfo *getLegalizerInfo() const override; const RegisterBankInfo *getRegBankInfo() const override; - const InstructionSelector *getInstructionSelector() const override; + InstructionSelector *getInstructionSelector() const override; }; } // End llvm namespace diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index c878abb042e4..e58f316791ba 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -117,14 +117,17 @@ MipsTargetMachine::MipsTargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT, CPU, FS, Options, getEffectiveRelocModel(JIT, RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), - isLittle(isLittle), TLOF(llvm::make_unique<MipsTargetObjectFile>()), + isLittle(isLittle), TLOF(std::make_unique<MipsTargetObjectFile>()), ABI(MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions)), - Subtarget(nullptr), DefaultSubtarget(TT, CPU, FS, isLittle, *this, - Options.StackAlignmentOverride), + Subtarget(nullptr), + DefaultSubtarget(TT, CPU, FS, isLittle, *this, + MaybeAlign(Options.StackAlignmentOverride)), NoMips16Subtarget(TT, CPU, FS.empty() ? "-mips16" : FS.str() + ",-mips16", - isLittle, *this, Options.StackAlignmentOverride), + isLittle, *this, + MaybeAlign(Options.StackAlignmentOverride)), Mips16Subtarget(TT, CPU, FS.empty() ? "+mips16" : FS.str() + ",+mips16", - isLittle, *this, Options.StackAlignmentOverride) { + isLittle, *this, + MaybeAlign(Options.StackAlignmentOverride)) { Subtarget = &DefaultSubtarget; initAsmInfo(); } @@ -196,8 +199,9 @@ MipsTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
resetTargetOptions(F); - I = llvm::make_unique<MipsSubtarget>(TargetTriple, CPU, FS, isLittle, *this, - Options.StackAlignmentOverride); + I = std::make_unique<MipsSubtarget>( + TargetTriple, CPU, FS, isLittle, *this, + MaybeAlign(Options.StackAlignmentOverride)); } return I.get(); } diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h index 1fa8ebadd643..298d056ce2c3 100644 --- a/lib/Target/Mips/MipsTargetStreamer.h +++ b/lib/Target/Mips/MipsTargetStreamer.h @@ -130,6 +130,8 @@ public: SMLoc IDLoc, const MCSubtargetInfo *STI); void emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, unsigned Reg2, SMLoc IDLoc, const MCSubtargetInfo *STI); + void emitRRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1, unsigned Reg2, + MCOperand Op3, SMLoc IDLoc, const MCSubtargetInfo *STI); void emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm, SMLoc IDLoc, const MCSubtargetInfo *STI); void emitRRIII(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm0, @@ -154,17 +156,13 @@ public: unsigned BaseReg, int64_t Offset, function_ref<unsigned()> GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI); - void emitStoreWithSymOffset(unsigned Opcode, unsigned SrcReg, - unsigned BaseReg, MCOperand &HiOperand, - MCOperand &LoOperand, unsigned ATReg, SMLoc IDLoc, - const MCSubtargetInfo *STI); + void emitSCWithSymOffset(unsigned Opcode, unsigned SrcReg, unsigned BaseReg, + MCOperand &HiOperand, MCOperand &LoOperand, + unsigned ATReg, SMLoc IDLoc, + const MCSubtargetInfo *STI); void emitLoadWithImmOffset(unsigned Opcode, unsigned DstReg, unsigned BaseReg, int64_t Offset, unsigned TmpReg, SMLoc IDLoc, const MCSubtargetInfo *STI); - void emitLoadWithSymOffset(unsigned Opcode, unsigned DstReg, unsigned BaseReg, - MCOperand &HiOperand, MCOperand &LoOperand, - unsigned ATReg, SMLoc IDLoc, - const MCSubtargetInfo *STI); void emitGPRestore(int Offset, SMLoc IDLoc, const MCSubtargetInfo *STI); void forbidModuleDirective() { ModuleDirectiveAllowed = false; } diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h index 6530c40ea100..0acbace5f848 100644 --- a/lib/Target/NVPTX/NVPTX.h +++ b/lib/Target/NVPTX/NVPTX.h @@ -44,7 +44,7 @@ MachineFunctionPass *createNVPTXPrologEpilogPass(); MachineFunctionPass *createNVPTXReplaceImageHandlesPass(); FunctionPass *createNVPTXImageOptimizerPass(); FunctionPass *createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM); -BasicBlockPass *createNVPTXLowerAllocaPass(); +FunctionPass *createNVPTXLowerAllocaPass(); MachineFunctionPass *createNVPTXPeephole(); MachineFunctionPass *createNVPTXProxyRegErasurePass(); diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 5f38b4a3c4c5..307f4d58c3ab 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -282,7 +282,7 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO, } unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { const TargetRegisterClass *RC = MRI->getRegClass(Reg); DenseMap<unsigned, unsigned> &RegMap = VRegMapping[RC]; @@ -434,7 +434,7 @@ bool NVPTXAsmPrinter::isLoopHeaderOfNoUnroll( return false; } -void NVPTXAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { +void NVPTXAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) { AsmPrinter::EmitBasicBlockStart(MBB); if (isLoopHeaderOfNoUnroll(MBB)) OutStreamer->EmitRawText(StringRef("\t.pragma 
\"nounroll\";\n")); @@ -507,8 +507,8 @@ const MCSymbol *NVPTXAsmPrinter::getFunctionFrameSymbol() const { } void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const { - unsigned RegNo = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isVirtualRegister(RegNo)) { + Register RegNo = MI->getOperand(0).getReg(); + if (Register::isVirtualRegister(RegNo)) { OutStreamer->AddComment(Twine("implicit-def: ") + getVirtualRegisterName(RegNo)); } else { @@ -1397,7 +1397,7 @@ static unsigned int getOpenCLAlignment(const DataLayout &DL, Type *Ty) { auto *FTy = dyn_cast<FunctionType>(Ty); if (FTy) - return DL.getPointerPrefAlignment(); + return DL.getPointerPrefAlignment().value(); return DL.getPrefTypeAlignment(Ty); } @@ -1473,12 +1473,11 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // Just print .param .align <a> .b8 .param[size]; // <a> = PAL.getparamalignment // size = typeallocsize of element type - unsigned align = PAL.getParamAlignment(paramIndex); - if (align == 0) - align = DL.getABITypeAlignment(Ty); + const Align align = DL.getValueOrABITypeAlignment( + PAL.getParamAlignment(paramIndex), Ty); unsigned sz = DL.getTypeAllocSize(Ty); - O << "\t.param .align " << align << " .b8 "; + O << "\t.param .align " << align.value() << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; @@ -1559,9 +1558,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // Just print .param .align <a> .b8 .param[size]; // <a> = PAL.getparamalignment // size = typeallocsize of element type - unsigned align = PAL.getParamAlignment(paramIndex); - if (align == 0) - align = DL.getABITypeAlignment(ETy); + Align align = + DL.getValueOrABITypeAlignment(PAL.getParamAlignment(paramIndex), ETy); // Work around a bug in ptxas. When PTX code takes address of // byval parameter with alignment < 4, ptxas generates code to // spill argument into memory. Alas on sm_50+ ptxas generates @@ -1573,10 +1571,10 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // TODO: this will need to be undone when we get to support multi-TU // device-side compilation as it breaks ABI compatibility with nvcc. // Hopefully ptxas bug is fixed by then. - if (!isKernelFunc && align < 4) - align = 4; + if (!isKernelFunc && align < Align(4)) + align = Align(4); unsigned sz = DL.getTypeAllocSize(ETy); - O << "\t.param .align " << align << " .b8 "; + O << "\t.param .align " << align.value() << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; continue; @@ -1653,7 +1651,7 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters( // We use the per class virtual register number in the ptx output. 
unsigned int numVRs = MRI->getNumVirtRegs(); for (unsigned i = 0; i < numVRs; i++) { - unsigned int vr = TRI->index2VirtReg(i); + unsigned int vr = Register::index2VirtReg(i); const TargetRegisterClass *RC = MRI->getRegClass(vr); DenseMap<unsigned, unsigned> ®map = VRegMapping[RC]; int n = regmap.size(); @@ -1861,7 +1859,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, case Type::HalfTyID: case Type::FloatTyID: case Type::DoubleTyID: { - const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV); + const auto *CFP = cast<ConstantFP>(CPV); Type *Ty = CFP->getType(); if (Ty == Type::getHalfTy(CPV->getContext())) { APInt API = CFP->getValueAPF().bitcastToAPInt(); @@ -2212,7 +2210,7 @@ void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum, const MachineOperand &MO = MI->getOperand(opNum); switch (MO.getType()) { case MachineOperand::MO_Register: - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + if (Register::isPhysicalRegister(MO.getReg())) { if (MO.getReg() == NVPTX::VRDepot) O << DEPOTNAME << getFunctionNumber(); else diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h index 43ae57ac1262..7a66854d32f4 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -200,7 +200,7 @@ private: const Function *F; std::string CurrentFnName; - void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override; + void EmitBasicBlockStart(const MachineBasicBlock &MBB) override; void EmitFunctionEntryLabel() override; void EmitFunctionBodyStart() override; void EmitFunctionBodyEnd() override; diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp index 46f08b23d31a..d26912f47e50 100644 --- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp +++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp @@ -25,7 +25,7 @@ using namespace llvm; NVPTXFrameLowering::NVPTXFrameLowering() - : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0) {} + : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, Align(8), 0) {} bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; } diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index ae1aa98da0e8..9acd0bea66fd 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -480,7 +480,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::TRAP, MVT::Other, Legal); // Register custom handling for vector loads/stores - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { if (IsPTXVectorType(VT)) { setOperationAction(ISD::LOAD, VT, Custom); setOperationAction(ISD::STORE, VT, Custom); @@ -1291,8 +1291,8 @@ std::string NVPTXTargetLowering::getPrototype( O << ".param .b" << size << " _"; } else if (isa<PointerType>(retTy)) { O << ".param .b" << PtrVT.getSizeInBits() << " _"; - } else if (retTy->isAggregateType() || retTy->isVectorTy() || retTy->isIntegerTy(128)) { - auto &DL = CS.getCalledFunction()->getParent()->getDataLayout(); + } else if (retTy->isAggregateType() || retTy->isVectorTy() || + retTy->isIntegerTy(128)) { O << ".param .align " << retAlignment << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]"; } else { @@ -2230,8 +2230,8 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType() == MVT::v2f16) { LoadSDNode *Load = cast<LoadSDNode>(Op); EVT MemVT = Load->getMemoryVT(); - if 
(!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, - *Load->getMemOperand())) { + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + MemVT, *Load->getMemOperand())) { SDValue Ops[2]; std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); return DAG.getMergeValues(Ops, SDLoc(Op)); @@ -2273,8 +2273,8 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // v2f16 is legal, so we can't rely on legalizer to handle unaligned // stores and have to handle it here. if (VT == MVT::v2f16 && - !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, - *Store->getMemOperand())) + !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + VT, *Store->getMemOperand())) return expandUnalignedStore(Store, DAG); if (VT.isVector()) @@ -3497,7 +3497,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col: @@ -3521,7 +3521,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 8; + Info.align = Align(8); return true; } @@ -3547,7 +3547,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3585,7 +3585,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 4; + Info.align = Align(4); return true; } @@ -3606,7 +3606,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3627,7 +3627,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3648,7 +3648,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3665,7 +3665,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 8; + Info.align = Align(8); return true; } @@ -3686,7 +3686,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOStore; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3707,7 +3707,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOStore; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3728,7 +3728,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOStore; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3745,7 +3745,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOStore; - Info.align = 8; + Info.align = Align(8); return true; } @@ -3780,7 +3780,7 @@ bool 
NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - Info.align = 0; + Info.align.reset(); return true; } @@ -3798,7 +3798,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); + Info.align = + MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue()); return true; } @@ -3817,7 +3818,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); + Info.align = + MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue()); return true; } @@ -3883,7 +3885,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_tex_1d_v4s32_s32: @@ -4003,7 +4005,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_suld_1d_i8_clamp: @@ -4056,7 +4058,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_suld_1d_i16_clamp: @@ -4109,7 +4111,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_suld_1d_i32_clamp: @@ -4162,7 +4164,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_suld_1d_i64_clamp: @@ -4200,7 +4202,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } return false; diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index 62da3c79f465..fe7a84f9a361 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -143,12 +143,17 @@ def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">; def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">; def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">; def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">; +def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">; def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">; def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">; def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">; def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">; +// non-sync shfl instructions are not available on sm_70+ in PTX6.4+ +def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70" + "&& Subtarget->getPTXVersion() >= 64)">; + def useShortPtr : Predicate<"useShortPointers()">; def useFP16Math: Predicate<"Subtarget->allowFP16Math()">; @@ -2908,7 +2913,7 @@ def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>; // ctz instruction always returns a 32-bit value. 
For ctlz.i64, convert the // ptx value to 64 bits to match the ISD node's semantics, unless we know we're // truncating back down to 32 bits. -def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; +def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>; // For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the @@ -2925,10 +2930,10 @@ def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>; // and then ctlz that value. This way we don't have to subtract 16 from the // result. Unfortunately today we don't have a way to generate // "mov b32reg, {b16imm, b16reg}", so we don't do this optimization. -def : Pat<(ctlz Int16Regs:$a), +def : Pat<(i16 (ctlz Int16Regs:$a)), (SUBi16ri (CVT_u16_u32 (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>; -def : Pat<(i32 (zext (ctlz Int16Regs:$a))), +def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))), (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>; // Population count @@ -2953,7 +2958,7 @@ def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>; // If we know that we're storing into an i32, we can avoid the final trunc. def : Pat<(ctpop Int16Regs:$a), (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>; -def : Pat<(i32 (zext (ctpop Int16Regs:$a))), +def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))), (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>; // fpround f32 -> f16 diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td index 1752d3e0575e..c52195fb0449 100644 --- a/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -56,6 +56,10 @@ class RegSeq<int n, string prefix> { []); } +class THREADMASK_INFO<bit sync> { + list<bit> ret = !if(sync, [0,1], [0]); +} + //----------------------------------- // Synchronization and shuffle functions //----------------------------------- @@ -129,121 +133,64 @@ def INT_BARRIER_SYNC_CNT_II : NVPTXInst<(outs), (ins i32imm:$id, i32imm:$cnt), [(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>, Requires<[hasPTX60, hasSM30]>; - -// shfl.{up,down,bfly,idx}.b32 -multiclass SHFL<NVPTXRegClass regclass, string mode, Intrinsic IntOp> { - // The last two parameters to shfl can be regs or imms. ptxas is smart - // enough to inline constant registers, so strictly speaking we don't need to - // handle immediates here. But it's easy enough, and it makes our ptx more - // readable. 
- def reg : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, Int32Regs:$offset, Int32Regs:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, Int32Regs:$mask))]>; - - def imm1 : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, i32imm:$offset, Int32Regs:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, Int32Regs:$mask))]>; - - def imm2 : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, Int32Regs:$offset, i32imm:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, imm:$mask))]>; - - def imm3 : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, i32imm:$offset, i32imm:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, imm:$mask))]>; +class SHFL_INSTR<bit sync, string mode, string reg, bit return_pred, + bit offset_imm, bit mask_imm, bit threadmask_imm> + : NVPTXInst<(outs), (ins), "?", []> { + NVPTXRegClass rc = !cond( + !eq(reg, "i32"): Int32Regs, + !eq(reg, "f32"): Float32Regs); + string IntrName = "int_nvvm_shfl_" + # !if(sync, "sync_", "") + # mode + # "_" # reg + # !if(return_pred, "p", ""); + Intrinsic Intr = !cast<Intrinsic>(IntrName); + let InOperandList = !con( + !if(sync, + !dag(ins, !if(threadmask_imm, [i32imm], [Int32Regs]), ["threadmask"]), + (ins)), + (ins rc:$src), + !dag(ins, !if(offset_imm, [i32imm], [Int32Regs]), ["offset"]), + !dag(ins, !if(mask_imm, [i32imm], [Int32Regs]), ["mask"]) + ); + let OutOperandList = !if(return_pred, (outs rc:$dst, Int1Regs:$pred), (outs rc:$dst)); + let AsmString = "shfl." + # !if(sync, "sync.", "") + # mode # ".b32\t" + # "$dst" + # !if(return_pred, "|$pred", "") # ", " + # "$src, $offset, $mask" + # !if(sync, ", $threadmask", "") + # ";" + ; + let Pattern = [!con( + !foreach(tmp, OutOperandList, + !subst(outs, set, + !subst(i32imm, imm, tmp))), + (set !foreach(tmp, InOperandList, + !subst(ins, Intr, + !subst(i32imm, imm, tmp)))) + )]; } -defm INT_SHFL_DOWN_I32 : SHFL<Int32Regs, "down", int_nvvm_shfl_down_i32>; -defm INT_SHFL_DOWN_F32 : SHFL<Float32Regs, "down", int_nvvm_shfl_down_f32>; -defm INT_SHFL_UP_I32 : SHFL<Int32Regs, "up", int_nvvm_shfl_up_i32>; -defm INT_SHFL_UP_F32 : SHFL<Float32Regs, "up", int_nvvm_shfl_up_f32>; -defm INT_SHFL_BFLY_I32 : SHFL<Int32Regs, "bfly", int_nvvm_shfl_bfly_i32>; -defm INT_SHFL_BFLY_F32 : SHFL<Float32Regs, "bfly", int_nvvm_shfl_bfly_f32>; -defm INT_SHFL_IDX_I32 : SHFL<Int32Regs, "idx", int_nvvm_shfl_idx_i32>; -defm INT_SHFL_IDX_F32 : SHFL<Float32Regs, "idx", int_nvvm_shfl_idx_f32>; - -multiclass SHFL_SYNC<NVPTXRegClass regclass, string mode, Intrinsic IntOp> { - // Threadmask and the last two parameters to shfl.sync can be regs or imms. - // ptxas is smart enough to inline constant registers, so strictly speaking we - // don't need to handle immediates here. But it's easy enough, and it makes - // our ptx more readable. 
- def rrr : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - Int32Regs:$offset, Int32Regs:$mask))]>; - - def rri : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - Int32Regs:$offset, imm:$mask))]>; - - def rir : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - imm:$offset, Int32Regs:$mask))]>; - - def rii : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - imm:$offset, imm:$mask))]>; - - def irr : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - Int32Regs:$offset, Int32Regs:$mask))]>; - - def iri : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - Int32Regs:$offset, imm:$mask))]>; - - def iir : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - imm:$offset, Int32Regs:$mask))]>; - - def iii : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - imm:$offset, imm:$mask))]>; +foreach sync = [0, 1] in { + foreach mode = ["up", "down", "bfly", "idx"] in { + foreach regclass = ["i32", "f32"] in { + foreach return_pred = [0, 1] in { + foreach offset_imm = [0, 1] in { + foreach mask_imm = [0, 1] in { + foreach threadmask_imm = THREADMASK_INFO<sync>.ret in { + def : SHFL_INSTR<sync, mode, regclass, return_pred, + offset_imm, mask_imm, threadmask_imm>, + Requires<!if(sync, [hasSM30], [hasSM30, hasSHFL])>; + } + } + } + } + } + } } -// On sm_70 these don't have to be convergent, so we may eventually want to -// implement non-convergent variant of this intrinsic. 
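// The nested `foreach` expansion above can be pictured as plain loops; this
// sketch only enumerates the same parameter space to show how many shfl
// records fall out of the one parameterized SHFL_INSTR class (illustrative,
// not TableGen semantics):
#include <cstdio>
#include <initializer_list>

int main() {
  unsigned Count = 0;
  for (int Sync = 0; Sync <= 1; ++Sync)
    for (const char *Mode : {"up", "down", "bfly", "idx"})
      for (const char *Reg : {"i32", "f32"})
        for (int RetPred = 0; RetPred <= 1; ++RetPred)
          for (int OffImm = 0; OffImm <= 1; ++OffImm)
            for (int MaskImm = 0; MaskImm <= 1; ++MaskImm)
              // THREADMASK_INFO: non-sync shfl has no threadmask operand, so
              // only sync variants also iterate over reg/imm threadmasks.
              for (int TMImm = 0; TMImm <= (Sync ? 1 : 0); ++TMImm) {
                ++Count;
                (void)Mode;
                (void)Reg;
              }
  // 4 modes * 2 regclasses * 2^3 operand flags * (1 non-sync + 2 sync)
  std::printf("%u shfl instruction records\n", Count); // prints 192
}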
-defm INT_SHFL_SYNC_DOWN_I32 : SHFL_SYNC<Int32Regs, "down", int_nvvm_shfl_sync_down_i32>; -defm INT_SHFL_SYNC_DOWN_F32 : SHFL_SYNC<Float32Regs, "down", int_nvvm_shfl_sync_down_f32>; -defm INT_SHFL_SYNC_UP_I32 : SHFL_SYNC<Int32Regs, "up", int_nvvm_shfl_sync_up_i32>; -defm INT_SHFL_SYNC_UP_F32 : SHFL_SYNC<Float32Regs, "up", int_nvvm_shfl_sync_up_f32>; -defm INT_SHFL_SYNC_BFLY_I32 : SHFL_SYNC<Int32Regs, "bfly", int_nvvm_shfl_sync_bfly_i32>; -defm INT_SHFL_SYNC_BFLY_F32 : SHFL_SYNC<Float32Regs, "bfly", int_nvvm_shfl_sync_bfly_f32>; -defm INT_SHFL_SYNC_IDX_I32 : SHFL_SYNC<Int32Regs, "idx", int_nvvm_shfl_sync_idx_i32>; -defm INT_SHFL_SYNC_IDX_F32 : SHFL_SYNC<Float32Regs, "idx", int_nvvm_shfl_sync_idx_f32>; - - // vote.{all,any,uni,ballot} multiclass VOTE<NVPTXRegClass regclass, string mode, Intrinsic IntOp> { def : NVPTXInst<(outs regclass:$dest), (ins Int1Regs:$pred), diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp index 0743a2986718..83039241a7c7 100644 --- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp +++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -103,7 +103,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { // Do the transformation of an aggr load/copy/set to a loop // for (LoadInst *LI : AggrLoads) { - StoreInst *SI = dyn_cast<StoreInst>(*LI->user_begin()); + auto *SI = cast<StoreInst>(*LI->user_begin()); Value *SrcAddr = LI->getOperand(0); Value *DstAddr = SI->getOperand(1); unsigned NumLoads = DL.getTypeStoreSize(LI->getType()); diff --git a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp index 76fb9f3fa692..945b7286b03c 100644 --- a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp +++ b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp @@ -41,12 +41,12 @@ void initializeNVPTXLowerAllocaPass(PassRegistry &); } namespace { -class NVPTXLowerAlloca : public BasicBlockPass { - bool runOnBasicBlock(BasicBlock &BB) override; +class NVPTXLowerAlloca : public FunctionPass { + bool runOnFunction(Function &F) override; public: static char ID; // Pass identification, replacement for typeid - NVPTXLowerAlloca() : BasicBlockPass(ID) {} + NVPTXLowerAlloca() : FunctionPass(ID) {} StringRef getPassName() const override { return "convert address space of alloca'ed memory to local"; } @@ -61,58 +61,61 @@ INITIALIZE_PASS(NVPTXLowerAlloca, "nvptx-lower-alloca", // ============================================================================= // Main function for this pass. 
// ============================================================================= -bool NVPTXLowerAlloca::runOnBasicBlock(BasicBlock &BB) { - if (skipBasicBlock(BB)) +bool NVPTXLowerAlloca::runOnFunction(Function &F) { + if (skipFunction(F)) return false; bool Changed = false; - for (auto &I : BB) { - if (auto allocaInst = dyn_cast<AllocaInst>(&I)) { - Changed = true; - auto PTy = dyn_cast<PointerType>(allocaInst->getType()); - auto ETy = PTy->getElementType(); - auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL); - auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, ""); - auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC); - auto NewASCToGeneric = new AddrSpaceCastInst(NewASCToLocal, - GenericAddrTy, ""); - NewASCToLocal->insertAfter(allocaInst); - NewASCToGeneric->insertAfter(NewASCToLocal); - for (Value::use_iterator UI = allocaInst->use_begin(), - UE = allocaInst->use_end(); - UI != UE; ) { - // Check Load, Store, GEP, and BitCast Uses on alloca and make them - // use the converted generic address, in order to expose non-generic - // addrspacecast to NVPTXInferAddressSpaces. For other types - // of instructions this is unnecessary and may introduce redundant - // address cast. - const auto &AllocaUse = *UI++; - auto LI = dyn_cast<LoadInst>(AllocaUse.getUser()); - if (LI && LI->getPointerOperand() == allocaInst && !LI->isVolatile()) { - LI->setOperand(LI->getPointerOperandIndex(), NewASCToGeneric); - continue; - } - auto SI = dyn_cast<StoreInst>(AllocaUse.getUser()); - if (SI && SI->getPointerOperand() == allocaInst && !SI->isVolatile()) { - SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric); - continue; - } - auto GI = dyn_cast<GetElementPtrInst>(AllocaUse.getUser()); - if (GI && GI->getPointerOperand() == allocaInst) { - GI->setOperand(GI->getPointerOperandIndex(), NewASCToGeneric); - continue; - } - auto BI = dyn_cast<BitCastInst>(AllocaUse.getUser()); - if (BI && BI->getOperand(0) == allocaInst) { - BI->setOperand(0, NewASCToGeneric); - continue; + for (auto &BB : F) + for (auto &I : BB) { + if (auto allocaInst = dyn_cast<AllocaInst>(&I)) { + Changed = true; + auto PTy = dyn_cast<PointerType>(allocaInst->getType()); + auto ETy = PTy->getElementType(); + auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL); + auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, ""); + auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC); + auto NewASCToGeneric = + new AddrSpaceCastInst(NewASCToLocal, GenericAddrTy, ""); + NewASCToLocal->insertAfter(allocaInst); + NewASCToGeneric->insertAfter(NewASCToLocal); + for (Value::use_iterator UI = allocaInst->use_begin(), + UE = allocaInst->use_end(); + UI != UE;) { + // Check Load, Store, GEP, and BitCast Uses on alloca and make them + // use the converted generic address, in order to expose non-generic + // addrspacecast to NVPTXInferAddressSpaces. For other types + // of instructions this is unnecessary and may introduce redundant + // address cast. 
+ const auto &AllocaUse = *UI++; + auto LI = dyn_cast<LoadInst>(AllocaUse.getUser()); + if (LI && LI->getPointerOperand() == allocaInst && + !LI->isVolatile()) { + LI->setOperand(LI->getPointerOperandIndex(), NewASCToGeneric); + continue; + } + auto SI = dyn_cast<StoreInst>(AllocaUse.getUser()); + if (SI && SI->getPointerOperand() == allocaInst && + !SI->isVolatile()) { + SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric); + continue; + } + auto GI = dyn_cast<GetElementPtrInst>(AllocaUse.getUser()); + if (GI && GI->getPointerOperand() == allocaInst) { + GI->setOperand(GI->getPointerOperandIndex(), NewASCToGeneric); + continue; + } + auto BI = dyn_cast<BitCastInst>(AllocaUse.getUser()); + if (BI && BI->getOperand(0) == allocaInst) { + BI->setOperand(0, NewASCToGeneric); + continue; + } } } } - } return Changed; } -BasicBlockPass *llvm::createNVPTXLowerAllocaPass() { +FunctionPass *llvm::createNVPTXLowerAllocaPass() { return new NVPTXLowerAlloca(); } diff --git a/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/lib/Target/NVPTX/NVPTXLowerArgs.cpp index c5e02e34e25e..c3c5f6fbcba7 100644 --- a/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -164,7 +164,7 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) { // Set the alignment to alignment of the byval parameter. This is because, // later load/stores assume that alignment, and we are going to replace // the use of the byval parameter with this alloca instruction. - AllocA->setAlignment(Func->getParamAlignment(Arg->getArgNo())); + AllocA->setAlignment(MaybeAlign(Func->getParamAlignment(Arg->getArgNo()))); Arg->replaceAllUsesWith(AllocA); Value *ArgInParam = new AddrSpaceCastInst( diff --git a/lib/Target/NVPTX/NVPTXPeephole.cpp b/lib/Target/NVPTX/NVPTXPeephole.cpp index 629757db8707..5e6411c61eab 100644 --- a/lib/Target/NVPTX/NVPTXPeephole.cpp +++ b/lib/Target/NVPTX/NVPTXPeephole.cpp @@ -81,7 +81,7 @@ static bool isCVTAToLocalCombinationCandidate(MachineInstr &Root) { auto &Op = Root.getOperand(1); const auto &MRI = MF.getRegInfo(); MachineInstr *GenericAddrDef = nullptr; - if (Op.isReg() && TargetRegisterInfo::isVirtualRegister(Op.getReg())) { + if (Op.isReg() && Register::isVirtualRegister(Op.getReg())) { GenericAddrDef = MRI.getUniqueVRegDef(Op.getReg()); } diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp index 4c5a9adf1f65..a7127b0e9a99 100644 --- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp +++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp @@ -178,7 +178,7 @@ NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) { // frame index registers. Functions which don't want/need this optimization // will continue to use the existing code path. if (MFI.getUseLocalStackAllocationBlock()) { - unsigned Align = MFI.getLocalFrameMaxAlign(); + unsigned Align = MFI.getLocalFrameMaxAlign().value(); // Adjust to alignment boundary. 
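    // Worked example of the round-up below: Offset = 13, Align = 8 gives
    // (13 + 8 - 1) / 8 * 8 = 20 / 8 * 8 = 2 * 8 = 16, i.e. the next multiple
    // of Align at or above Offset (the same value llvm::alignTo computes).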
Offset = (Offset + Align - 1) / Align * Align; diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 11b3fe2fa3d3..f58fb5717773 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -116,7 +116,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, CPU, FS, Options, Reloc::PIC_, getEffectiveCodeModel(CM, CodeModel::Small), OL), is64bit(is64bit), UseShortPointers(UseShortPointersOpt), - TLOF(llvm::make_unique<NVPTXTargetObjectFile>()), + TLOF(std::make_unique<NVPTXTargetObjectFile>()), Subtarget(TT, CPU, FS, *this) { if (TT.getOS() == Triple::NVCL) drvInterface = NVPTX::NVCL; diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp index 665eb1383253..43c2e9920403 100644 --- a/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -19,10 +19,11 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/Support/ManagedStatic.h" -#include "llvm/Support/MutexGuard.h" +#include "llvm/Support/Mutex.h" #include <algorithm> #include <cstring> #include <map> +#include <mutex> #include <string> #include <vector> @@ -38,12 +39,12 @@ static ManagedStatic<per_module_annot_t> annotationCache; static sys::Mutex Lock; void clearAnnotationCache(const Module *Mod) { - MutexGuard Guard(Lock); + std::lock_guard<sys::Mutex> Guard(Lock); annotationCache->erase(Mod); } static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) { - MutexGuard Guard(Lock); + std::lock_guard<sys::Mutex> Guard(Lock); assert(md && "Invalid mdnode for annotation"); assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands"); // start index = 1, to skip the global variable key @@ -69,7 +70,7 @@ static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) { } static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) { - MutexGuard Guard(Lock); + std::lock_guard<sys::Mutex> Guard(Lock); NamedMDNode *NMD = m->getNamedMetadata("nvvm.annotations"); if (!NMD) return; @@ -103,7 +104,7 @@ static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) { bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop, unsigned &retval) { - MutexGuard Guard(Lock); + std::lock_guard<sys::Mutex> Guard(Lock); const Module *m = gv->getParent(); if ((*annotationCache).find(m) == (*annotationCache).end()) cacheAnnotationFromMD(m, gv); @@ -117,7 +118,7 @@ bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop, bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop, std::vector<unsigned> &retval) { - MutexGuard Guard(Lock); + std::lock_guard<sys::Mutex> Guard(Lock); const Module *m = gv->getParent(); if ((*annotationCache).find(m) == (*annotationCache).end()) cacheAnnotationFromMD(m, gv); diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index c9524da93acd..aedf5b713c3f 100644 --- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -579,7 +579,7 @@ public: static std::unique_ptr<PPCOperand> CreateToken(StringRef Str, SMLoc S, bool IsPPC64) { - auto Op = make_unique<PPCOperand>(Token); + auto Op = std::make_unique<PPCOperand>(Token); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->StartLoc = S; @@ -608,7 +608,7 @@ public: static std::unique_ptr<PPCOperand> CreateImm(int64_t Val, SMLoc S, SMLoc E, bool IsPPC64) { - auto Op = 
make_unique<PPCOperand>(Immediate); + auto Op = std::make_unique<PPCOperand>(Immediate); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -618,7 +618,7 @@ public: static std::unique_ptr<PPCOperand> CreateExpr(const MCExpr *Val, SMLoc S, SMLoc E, bool IsPPC64) { - auto Op = make_unique<PPCOperand>(Expression); + auto Op = std::make_unique<PPCOperand>(Expression); Op->Expr.Val = Val; Op->Expr.CRVal = EvaluateCRExpr(Val); Op->StartLoc = S; @@ -629,7 +629,7 @@ public: static std::unique_ptr<PPCOperand> CreateTLSReg(const MCSymbolRefExpr *Sym, SMLoc S, SMLoc E, bool IsPPC64) { - auto Op = make_unique<PPCOperand>(TLSRegister); + auto Op = std::make_unique<PPCOperand>(TLSRegister); Op->TLSReg.Sym = Sym; Op->StartLoc = S; Op->EndLoc = E; @@ -639,7 +639,7 @@ public: static std::unique_ptr<PPCOperand> CreateContextImm(int64_t Val, SMLoc S, SMLoc E, bool IsPPC64) { - auto Op = make_unique<PPCOperand>(ContextImmediate); + auto Op = std::make_unique<PPCOperand>(ContextImmediate); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 7a8af57961cb..3597fd15eeb1 100644 --- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -167,12 +167,6 @@ static DecodeStatus DecodeQFRCRegisterClass(MCInst &Inst, uint64_t RegNo, return decodeRegisterClass(Inst, RegNo, QFRegs); } -static DecodeStatus DecodeSPE4RCRegisterClass(MCInst &Inst, uint64_t RegNo, - uint64_t Address, - const void *Decoder) { - return decodeRegisterClass(Inst, RegNo, RRegs); -} - static DecodeStatus DecodeSPERCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index 042ddf48d5df..20f752c3041a 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -78,7 +78,7 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, // determine the type of the relocation unsigned Type; if (IsPCRel) { - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: llvm_unreachable("Unimplemented"); case PPC::fixup_ppc_br24: @@ -131,7 +131,7 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, break; } } else { - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: llvm_unreachable("invalid fixup kind!"); case FK_NONE: Type = ELF::R_PPC_NONE; @@ -443,5 +443,5 @@ bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, std::unique_ptr<MCObjectTargetWriter> llvm::createPPCELFObjectWriter(bool Is64Bit, uint8_t OSABI) { - return llvm::make_unique<PPCELFObjectWriter>(Is64Bit, OSABI); + return std::make_unique<PPCELFObjectWriter>(Is64Bit, OSABI); } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp index 0e64ae55ab1c..7fc231618fa9 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp @@ -66,6 +66,31 @@ void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, const MCSubtargetInfo &STI) { + // Customize printing of the addis instruction on AIX. 
When an operand is a + // symbol reference, the instruction syntax is changed to look like a load + // operation, i.e: + // Transform: addis $rD, $rA, $src --> addis $rD, $src($rA). + if (TT.isOSAIX() && + (MI->getOpcode() == PPC::ADDIS8 || MI->getOpcode() == PPC::ADDIS) && + MI->getOperand(2).isExpr()) { + assert((MI->getOperand(0).isReg() && MI->getOperand(1).isReg()) && + "The first and the second operand of an addis instruction" + " should be registers."); + + assert(isa<MCSymbolRefExpr>(MI->getOperand(2).getExpr()) && + "The third operand of an addis instruction should be a symbol " + "reference expression if it is an expression at all."); + + O << "\taddis "; + printOperand(MI, 0, O); + O << ", "; + printOperand(MI, 2, O); + O << "("; + printOperand(MI, 1, O); + O << ")"; + return; + } + // Check for slwi/srwi mnemonics. if (MI->getOpcode() == PPC::RLWINM) { unsigned char SH = MI->getOperand(2).getImm(); diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp index 5f0005ea1d7b..1216cd727289 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -86,4 +86,5 @@ void PPCXCOFFMCAsmInfo::anchor() {} PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) { assert(!IsLittleEndian && "Little-endian XCOFF not supported."); CodePointerSize = CalleeSaveStackSlotSize = Is64Bit ? 8 : 4; + ZeroDirective = "\t.space\t"; } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp index d467f5c4a439..fb9dd5d7aa75 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp @@ -19,8 +19,8 @@ using namespace llvm; const PPCMCExpr* PPCMCExpr::create(VariantKind Kind, const MCExpr *Expr, - bool isDarwin, MCContext &Ctx) { - return new (Ctx) PPCMCExpr(Kind, Expr, isDarwin); + bool IsDarwin, MCContext &Ctx) { + return new (Ctx) PPCMCExpr(Kind, Expr, IsDarwin); } void PPCMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h index 449e2c34f74d..ad1454566162 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h @@ -45,21 +45,21 @@ public: /// @{ static const PPCMCExpr *create(VariantKind Kind, const MCExpr *Expr, - bool isDarwin, MCContext &Ctx); + bool IsDarwin, MCContext &Ctx); static const PPCMCExpr *createLo(const MCExpr *Expr, - bool isDarwin, MCContext &Ctx) { - return create(VK_PPC_LO, Expr, isDarwin, Ctx); + bool IsDarwin, MCContext &Ctx) { + return create(VK_PPC_LO, Expr, IsDarwin, Ctx); } static const PPCMCExpr *createHi(const MCExpr *Expr, - bool isDarwin, MCContext &Ctx) { - return create(VK_PPC_HI, Expr, isDarwin, Ctx); + bool IsDarwin, MCContext &Ctx) { + return create(VK_PPC_HI, Expr, IsDarwin, Ctx); } static const PPCMCExpr *createHa(const MCExpr *Expr, - bool isDarwin, MCContext &Ctx) { - return create(VK_PPC_HA, Expr, isDarwin, Ctx); + bool IsDarwin, MCContext &Ctx) { + return create(VK_PPC_HA, Expr, IsDarwin, Ctx); } /// @} diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp index 4cf7fd15fa75..672f910ab086 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp @@ -178,7 +178,7 @@ static uint32_t getFixupOffset(const MCAsmLayout &Layout, uint32_t 
FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset(); // On Mach-O, ppc_fixup_half16 relocations must refer to the // start of the instruction, not the second halfword, as ELF does - if (unsigned(Fixup.getKind()) == PPC::fixup_ppc_half16) + if (Fixup.getTargetKind() == PPC::fixup_ppc_half16) FixupOffset &= ~uint32_t(3); return FixupOffset; } @@ -376,5 +376,5 @@ void PPCMachObjectWriter::RecordPPCRelocation( std::unique_ptr<MCObjectTargetWriter> llvm::createPPCMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype) { - return llvm::make_unique<PPCMachObjectWriter>(Is64Bit, CPUType, CPUSubtype); + return std::make_unique<PPCMachObjectWriter>(Is64Bit, CPUType, CPUSubtype); } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp index 9c661286d455..7fdbb8990b55 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp @@ -25,5 +25,5 @@ PPCXCOFFObjectWriter::PPCXCOFFObjectWriter(bool Is64Bit) std::unique_ptr<MCObjectTargetWriter> llvm::createPPCXCOFFObjectWriter(bool Is64Bit) { - return llvm::make_unique<PPCXCOFFObjectWriter>(Is64Bit); + return std::make_unique<PPCXCOFFObjectWriter>(Is64Bit); } diff --git a/lib/Target/PowerPC/P9InstrResources.td b/lib/Target/PowerPC/P9InstrResources.td index 2a10322d3f49..f6cd8ed00c82 100644 --- a/lib/Target/PowerPC/P9InstrResources.td +++ b/lib/Target/PowerPC/P9InstrResources.td @@ -64,6 +64,7 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], XXLAND, XXLANDC, XXLEQV, + XXLEQVOnes, XXLNAND, XXLNOR, XXLOR, @@ -124,8 +125,8 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C], (instregex "SRAD(I)?$"), (instregex "EXTSWSLI_32_64$"), (instregex "MFV(S)?RD$"), - (instregex "MTVSRD$"), - (instregex "MTVSRW(A|Z)$"), + (instregex "MTV(S)?RD$"), + (instregex "MTV(S)?RW(A|Z)$"), (instregex "CMP(WI|LWI|W|LW)(8)?$"), (instregex "CMP(L)?D(I)?$"), (instregex "SUBF(I)?C(8)?$"), @@ -148,7 +149,7 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C], (instregex "EXTS(B|H|W)(8)?(_32)?(_64)?(o)?$"), (instregex "ADD(4|8)(TLS)?(_)?$"), (instregex "NEG(8)?$"), - (instregex "ADDI(S)?toc(HA|L)$"), + (instregex "ADDI(S)?toc(HA|L)(8)?$"), COPY, MCRF, MCRXRX, @@ -158,6 +159,7 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C], XSNEGDP, XSCPSGNDP, MFVSRWZ, + MFVRWZ, EXTSWSLI, SRADI_32, RLDIC, diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h index c6951ab67b08..0534773c4c9e 100644 --- a/lib/Target/PowerPC/PPC.h +++ b/lib/Target/PowerPC/PPC.h @@ -50,10 +50,10 @@ namespace llvm { FunctionPass *createPPCExpandISELPass(); FunctionPass *createPPCPreEmitPeepholePass(); void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, - AsmPrinter &AP, bool isDarwin); + AsmPrinter &AP, bool IsDarwin); bool LowerPPCMachineOperandToMCOperand(const MachineOperand &MO, MCOperand &OutMO, AsmPrinter &AP, - bool isDarwin); + bool IsDarwin); void initializePPCCTRLoopsPass(PassRegistry&); #ifndef NDEBUG @@ -86,8 +86,8 @@ namespace llvm { MO_NO_FLAG, /// On a symbol operand "FOO", this indicates that the reference is actually - /// to "FOO@plt". This is used for calls and jumps to external functions on - /// for PIC calls on Linux and ELF systems. + /// to "FOO@plt". This is used for calls and jumps to external functions + /// and for PIC calls on 32-bit ELF systems. 
MO_PLT = 1, /// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index bd87ce06b4fb..66236b72a1a3 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -51,9 +51,11 @@ #include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSectionXCOFF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/MC/SectionKind.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" @@ -76,7 +78,7 @@ namespace { class PPCAsmPrinter : public AsmPrinter { protected: - MapVector<MCSymbol *, MCSymbol *> TOC; + MapVector<const MCSymbol *, MCSymbol *> TOC; const PPCSubtarget *Subtarget; StackMaps SM; @@ -87,7 +89,7 @@ public: StringRef getPassName() const override { return "PowerPC Assembly Printer"; } - MCSymbol *lookUpOrCreateTOCEntry(MCSymbol *Sym); + MCSymbol *lookUpOrCreateTOCEntry(const MCSymbol *Sym); bool doInitialization(Module &M) override { if (!TOC.empty()) @@ -164,6 +166,14 @@ public: : PPCAsmPrinter(TM, std::move(Streamer)) {} StringRef getPassName() const override { return "AIX PPC Assembly Printer"; } + + void SetupMachineFunction(MachineFunction &MF) override; + + void EmitGlobalVariable(const GlobalVariable *GV) override; + + void EmitFunctionDescriptor() override; + + void EmitEndOfAsmFile(Module &) override; }; } // end anonymous namespace @@ -265,7 +275,7 @@ bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, return true; // This operand uses VSX numbering. // If the operand is a VMX register, convert it to a VSX register. - unsigned Reg = MI->getOperand(OpNo).getReg(); + Register Reg = MI->getOperand(OpNo).getReg(); if (PPCInstrInfo::isVRRegister(Reg)) Reg = PPC::VSX32 + (Reg - PPC::V0); else if (PPCInstrInfo::isVFRegister(Reg)) @@ -328,7 +338,7 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, /// lookUpOrCreateTOCEntry -- Given a symbol, look up whether a TOC entry /// exists for it. If not, create one. Then return a symbol that references /// the TOC entry. -MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) { +MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(const MCSymbol *Sym) { MCSymbol *&TOCEntry = TOC[Sym]; if (!TOCEntry) TOCEntry = createTempSymbol("C"); @@ -378,7 +388,7 @@ void PPCAsmPrinter::LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI) { if (CallTarget) { assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget && "High 16 bits of call target should be zero."); - unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); + Register ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); EncodedBytes = 0; // Materialize the jump address: EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI8) @@ -502,13 +512,32 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI, .addExpr(SymVar)); } +/// Map a machine operand for a TOC pseudo-machine instruction to its +/// corresponding MCSymbol. 
+static MCSymbol *getMCSymbolForTOCPseudoMO(const MachineOperand &MO, + AsmPrinter &AP) { + switch (MO.getType()) { + case MachineOperand::MO_GlobalAddress: + return AP.getSymbol(MO.getGlobal()); + case MachineOperand::MO_ConstantPoolIndex: + return AP.GetCPISymbol(MO.getIndex()); + case MachineOperand::MO_JumpTableIndex: + return AP.GetJTISymbol(MO.getIndex()); + case MachineOperand::MO_BlockAddress: + return AP.GetBlockAddressSymbol(MO.getBlockAddress()); + default: + llvm_unreachable("Unexpected operand type to get symbol."); + } +} + /// EmitInstruction -- Print out a single PowerPC MI in Darwin syntax to /// the current output stream. /// void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCInst TmpInst; - bool isPPC64 = Subtarget->isPPC64(); - bool isDarwin = TM.getTargetTriple().isOSDarwin(); + const bool IsDarwin = TM.getTargetTriple().isOSDarwin(); + const bool IsPPC64 = Subtarget->isPPC64(); + const bool IsAIX = Subtarget->isAIXABI(); const Module *M = MF->getFunction().getParent(); PICLevel::Level PL = M->getPICLevel(); @@ -517,7 +546,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { if (!MI->isInlineAsm()) { for (const MachineOperand &MO: MI->operands()) { if (MO.isReg()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Subtarget->hasSPE()) { if (PPC::F4RCRegClass.contains(Reg) || PPC::F8RCRegClass.contains(Reg) || @@ -595,7 +624,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { // addis r30, r30, {.LTOC,_GLOBAL_OFFSET_TABLE} - .L0$pb@ha // addi r30, r30, {.LTOC,_GLOBAL_OFFSET_TABLE} - .L0$pb@l // Get the offset from the GOT Base Register to the GOT - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); if (Subtarget->isSecurePlt() && isPositionIndependent() ) { unsigned PICR = TmpInst.getOperand(0).getReg(); MCSymbol *BaseSymbol = OutContext.getOrCreateSymbol( @@ -646,43 +675,57 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { } } case PPC::LWZtoc: { - // Transform %r3 = LWZtoc @min1, %r2 - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + assert(!IsDarwin && "TOC is an ELF/XCOFF construct."); - // Change the opcode to LWZ, and the global address operand to be a - // reference to the GOT entry we will synthesize later. + // Transform %rN = LWZtoc @op1, %r2 + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); + + // Change the opcode to LWZ. TmpInst.setOpcode(PPC::LWZ); + const MachineOperand &MO = MI->getOperand(1); + assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) && + "Invalid operand for LWZtoc."); - // Map symbol -> label of TOC entry - assert(MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()); - MCSymbol *MOSymbol = nullptr; - if (MO.isGlobal()) - MOSymbol = getSymbol(MO.getGlobal()); - else if (MO.isCPI()) - MOSymbol = GetCPISymbol(MO.getIndex()); - else if (MO.isJTI()) - MOSymbol = GetJTISymbol(MO.getIndex()); - else if (MO.isBlockAddress()) - MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress()); + // Map the operand to its corresponding MCSymbol. + const MCSymbol *const MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); - if (PL == PICLevel::SmallPIC) { + // Create a reference to the GOT entry for the symbol. The GOT entry will be + // synthesized later. 
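+  // For reference, the three LWZtoc flavours handled below come out as
+  // roughly the following (register choices and temp-label names are
+  // illustrative):
+  //
+  //   lwz r3, foo@got(r30)       ; 32-bit ELF, small PIC: GOT entry
+  //   lwz r3, .LC0(r2)           ; AIX 32-bit small code model: TOC label
+  //   lwz r3, .LC0-.LTOC(r30)    ; 32-bit ELF, large PIC: toc-relative offset
+  //
+  // where .LC0 is the TOC/GOT slot synthesized later for foo.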
+ if (PL == PICLevel::SmallPIC && !IsAIX) { const MCExpr *Exp = MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_GOT, OutContext); TmpInst.getOperand(1) = MCOperand::createExpr(Exp); - } else { - MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol); + EmitToStreamer(*OutStreamer, TmpInst); + return; + } - const MCExpr *Exp = - MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_None, - OutContext); - const MCExpr *PB = - MCSymbolRefExpr::create(OutContext.getOrCreateSymbol(Twine(".LTOC")), - OutContext); - Exp = MCBinaryExpr::createSub(Exp, PB, OutContext); + // Otherwise, use the TOC. 'TOCEntry' is a label used to reference the + // storage allocated in the TOC which contains the address of + // 'MOSymbol'. Said TOC entry will be synthesized later. + MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol); + const MCExpr *Exp = + MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_None, OutContext); + + // AIX uses the label directly as the lwz displacement operand for + // references into the toc section. The displacement value will be generated + // relative to the toc-base. + if (IsAIX) { + assert( + TM.getCodeModel() == CodeModel::Small && + "This pseudo should only be selected for 32-bit small code model."); TmpInst.getOperand(1) = MCOperand::createExpr(Exp); + EmitToStreamer(*OutStreamer, TmpInst); + return; } + + // Create an explicit subtract expression between the local symbol and + // '.LTOC' to manifest the toc-relative offset. + const MCExpr *PB = MCSymbolRefExpr::create( + OutContext.getOrCreateSymbol(Twine(".LTOC")), OutContext); + Exp = MCBinaryExpr::createSub(Exp, PB, OutContext); + TmpInst.getOperand(1) = MCOperand::createExpr(Exp); EmitToStreamer(*OutStreamer, TmpInst); return; } @@ -690,72 +733,121 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::LDtocCPT: case PPC::LDtocBA: case PPC::LDtoc: { + assert(!IsDarwin && "TOC is an ELF/XCOFF construct"); + // Transform %x3 = LDtoc @min1, %x2 - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); - // Change the opcode to LD, and the global address operand to be a - // reference to the TOC entry we will synthesize later. + // Change the opcode to LD. TmpInst.setOpcode(PPC::LD); + const MachineOperand &MO = MI->getOperand(1); + assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) && + "Invalid operand!"); + + // Map the machine operand to its corresponding MCSymbol, then map the + // global address operand to be a reference to the TOC entry we will + // synthesize later. + MCSymbol *TOCEntry = + lookUpOrCreateTOCEntry(getMCSymbolForTOCPseudoMO(MO, *this)); + + const MCSymbolRefExpr::VariantKind VK = + IsAIX ? MCSymbolRefExpr::VK_None : MCSymbolRefExpr::VK_PPC_TOC; + const MCExpr *Exp = + MCSymbolRefExpr::create(TOCEntry, VK, OutContext); + TmpInst.getOperand(1) = MCOperand::createExpr(Exp); + EmitToStreamer(*OutStreamer, TmpInst); + return; + } + case PPC::ADDIStocHA: { + assert((IsAIX && !IsPPC64 && TM.getCodeModel() == CodeModel::Large) && + "This pseudo should only be selected for 32-bit large code model on" + " AIX."); + + // Transform %rd = ADDIStocHA %rA, @sym(%r2) + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); + + // Change the opcode to ADDIS. 
+ TmpInst.setOpcode(PPC::ADDIS); + + const MachineOperand &MO = MI->getOperand(2); + assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) && + "Invalid operand for ADDIStocHA."); - // Map symbol -> label of TOC entry - assert(MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()); - MCSymbol *MOSymbol = nullptr; - if (MO.isGlobal()) - MOSymbol = getSymbol(MO.getGlobal()); - else if (MO.isCPI()) - MOSymbol = GetCPISymbol(MO.getIndex()); - else if (MO.isJTI()) - MOSymbol = GetJTISymbol(MO.getIndex()); - else if (MO.isBlockAddress()) - MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress()); + // Map the machine operand to its corresponding MCSymbol. + MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + // Always use TOC on AIX. Map the global address operand to be a reference + // to the TOC entry we will synthesize later. 'TOCEntry' is a label used to + // reference the storage allocated in the TOC which contains the address of + // 'MOSymbol'. MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol); + const MCExpr *Exp = MCSymbolRefExpr::create(TOCEntry, + MCSymbolRefExpr::VK_PPC_U, + OutContext); + TmpInst.getOperand(2) = MCOperand::createExpr(Exp); + EmitToStreamer(*OutStreamer, TmpInst); + return; + } + case PPC::LWZtocL: { + assert(IsAIX && !IsPPC64 && TM.getCodeModel() == CodeModel::Large && + "This pseudo should only be selected for 32-bit large code model on" + " AIX."); - const MCExpr *Exp = - MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC, - OutContext); + // Transform %rd = LWZtocL @sym, %rs. + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); + + // Change the opcode to lwz. + TmpInst.setOpcode(PPC::LWZ); + + const MachineOperand &MO = MI->getOperand(1); + assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) && + "Invalid operand for LWZtocL."); + + // Map the machine operand to its corresponding MCSymbol. + MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + + // Always use TOC on AIX. Map the global address operand to be a reference + // to the TOC entry we will synthesize later. 'TOCEntry' is a label used to + // reference the storage allocated in the TOC which contains the address of + // 'MOSymbol'. + MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol); + const MCExpr *Exp = MCSymbolRefExpr::create(TOCEntry, + MCSymbolRefExpr::VK_PPC_L, + OutContext); TmpInst.getOperand(1) = MCOperand::createExpr(Exp); EmitToStreamer(*OutStreamer, TmpInst); return; } + case PPC::ADDIStocHA8: { + assert(!IsDarwin && "TOC is an ELF/XCOFF construct"); - case PPC::ADDIStocHA: { - // Transform %xd = ADDIStocHA %x2, @sym - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + // Transform %xd = ADDIStocHA8 %x2, @sym + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); - // Change the opcode to ADDIS8. If the global address is external, has - // common linkage, is a non-local function address, or is a jump table - // address, then generate a TOC entry and reference that. Otherwise - // reference the symbol directly. + // Change the opcode to ADDIS8. If the global address is the address of + // an external symbol, is a jump table address, is a block address, or is a + // constant pool index with large code model enabled, then generate a TOC + // entry and reference that. Otherwise, reference the symbol directly. 
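+    // The resulting medium/large code model addressing sequence looks
+    // roughly like (x3 chosen for illustration):
+    //
+    //   addis x3, x2, foo@toc@ha   ; ADDIStocHA8: high adjusted TOC offset
+    //   ld    x3, foo@toc@l(x3)    ; LDtocL: indirect through the TOC entry
+    //     -- or --
+    //   addi  x3, x3, foo@toc@l    ; ADDItocL: direct, when foo is local
+    //
+    // with x2 holding the TOC base; whether `foo` itself or a synthesized
+    // .LC label appears depends on the lookUpOrCreateTOCEntry decision here.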
TmpInst.setOpcode(PPC::ADDIS8); + const MachineOperand &MO = MI->getOperand(2); - assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || - MO.isBlockAddress()) && - "Invalid operand for ADDIStocHA!"); - MCSymbol *MOSymbol = nullptr; - bool GlobalToc = false; + assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) && + "Invalid operand for ADDIStocHA8!"); - if (MO.isGlobal()) { - const GlobalValue *GV = MO.getGlobal(); - MOSymbol = getSymbol(GV); - unsigned char GVFlags = Subtarget->classifyGlobalReference(GV); - GlobalToc = (GVFlags & PPCII::MO_NLP_FLAG); - } else if (MO.isCPI()) { - MOSymbol = GetCPISymbol(MO.getIndex()); - } else if (MO.isJTI()) { - MOSymbol = GetJTISymbol(MO.getIndex()); - } else if (MO.isBlockAddress()) { - MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress()); - } + const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + const bool GlobalToc = + MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal()); if (GlobalToc || MO.isJTI() || MO.isBlockAddress() || - TM.getCodeModel() == CodeModel::Large) + (MO.isCPI() && TM.getCodeModel() == CodeModel::Large)) MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); + const MCSymbolRefExpr::VariantKind VK = + IsAIX ? MCSymbolRefExpr::VK_PPC_U : MCSymbolRefExpr::VK_PPC_TOC_HA; + const MCExpr *Exp = - MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_HA, - OutContext); + MCSymbolRefExpr::create(MOSymbol, VK, OutContext); if (!MO.isJTI() && MO.getOffset()) Exp = MCBinaryExpr::createAdd(Exp, @@ -768,73 +860,59 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::LDtocL: { + assert(!IsDarwin && "TOC is an ELF/XCOFF construct"); + // Transform %xd = LDtocL @sym, %xs - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); - // Change the opcode to LD. If the global address is external, has - // common linkage, or is a jump table address, then reference the - // associated TOC entry. Otherwise reference the symbol directly. + // Change the opcode to LD. If the global address is the address of + // an external symbol, is a jump table address, is a block address, or is + // a constant pool index with large code model enabled, then generate a + // TOC entry and reference that. Otherwise, reference the symbol directly. TmpInst.setOpcode(PPC::LD); + const MachineOperand &MO = MI->getOperand(1); assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) && "Invalid operand for LDtocL!"); - MCSymbol *MOSymbol = nullptr; - if (MO.isJTI()) - MOSymbol = lookUpOrCreateTOCEntry(GetJTISymbol(MO.getIndex())); - else if (MO.isBlockAddress()) { - MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress()); - MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); - } - else if (MO.isCPI()) { - MOSymbol = GetCPISymbol(MO.getIndex()); - if (TM.getCodeModel() == CodeModel::Large) - MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); - } - else if (MO.isGlobal()) { - const GlobalValue *GV = MO.getGlobal(); - MOSymbol = getSymbol(GV); - LLVM_DEBUG( - unsigned char GVFlags = Subtarget->classifyGlobalReference(GV); - assert((GVFlags & PPCII::MO_NLP_FLAG) && - "LDtocL used on symbol that could be accessed directly is " - "invalid. Must match ADDIStocHA.")); + LLVM_DEBUG(assert( + (!MO.isGlobal() || Subtarget->isGVIndirectSymbol(MO.getGlobal())) && + "LDtocL used on symbol that could be accessed directly is " + "invalid. 
Must match ADDIStocHA8.")); + + const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + + if (!MO.isCPI() || TM.getCodeModel() == CodeModel::Large) MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); - } + const MCSymbolRefExpr::VariantKind VK = + IsAIX ? MCSymbolRefExpr::VK_PPC_L : MCSymbolRefExpr::VK_PPC_TOC_LO; const MCExpr *Exp = - MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO, - OutContext); + MCSymbolRefExpr::create(MOSymbol, VK, OutContext); TmpInst.getOperand(1) = MCOperand::createExpr(Exp); EmitToStreamer(*OutStreamer, TmpInst); return; } case PPC::ADDItocL: { // Transform %xd = ADDItocL %xs, @sym - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); - // Change the opcode to ADDI8. If the global address is external, then - // generate a TOC entry and reference that. Otherwise reference the + // Change the opcode to ADDI8. If the global address is external, then + // generate a TOC entry and reference that. Otherwise, reference the // symbol directly. TmpInst.setOpcode(PPC::ADDI8); + const MachineOperand &MO = MI->getOperand(2); - assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL"); - MCSymbol *MOSymbol = nullptr; + assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL."); - if (MO.isGlobal()) { - const GlobalValue *GV = MO.getGlobal(); - LLVM_DEBUG(unsigned char GVFlags = Subtarget->classifyGlobalReference(GV); - assert(!(GVFlags & PPCII::MO_NLP_FLAG) && - "Interposable definitions must use indirect access.")); - MOSymbol = getSymbol(GV); - } else if (MO.isCPI()) { - MOSymbol = GetCPISymbol(MO.getIndex()); - } + LLVM_DEBUG(assert( + !(MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal())) && + "Interposable definitions must use indirect access.")); const MCExpr *Exp = - MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO, - OutContext); + MCSymbolRefExpr::create(getMCSymbolForTOCPseudoMO(MO, *this), + MCSymbolRefExpr::VK_PPC_TOC_LO, OutContext); TmpInst.getOperand(2) = MCOperand::createExpr(Exp); EmitToStreamer(*OutStreamer, TmpInst); return; @@ -842,13 +920,13 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::ADDISgotTprelHA: { // Transform: %xd = ADDISgotTprelHA %x2, @sym // Into: %xd = ADDIS8 %x2, sym@got@tlsgd@ha - assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC"); + assert(IsPPC64 && "Not supported for 32-bit PowerPC"); const MachineOperand &MO = MI->getOperand(2); const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); const MCExpr *SymGotTprel = - MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA, - OutContext); + MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA, + OutContext); EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8) .addReg(MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()) @@ -858,16 +936,17 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::LDgotTprelL: case PPC::LDgotTprelL32: { // Transform %xd = LDgotTprelL @sym, %xs - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); // Change the opcode to LD. - TmpInst.setOpcode(isPPC64 ? PPC::LD : PPC::LWZ); + TmpInst.setOpcode(IsPPC64 ? 
PPC::LD : PPC::LWZ); const MachineOperand &MO = MI->getOperand(1); const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); - const MCExpr *Exp = - MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO, - OutContext); + const MCExpr *Exp = MCSymbolRefExpr::create( + MOSymbol, IsPPC64 ? MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO + : MCSymbolRefExpr::VK_PPC_GOT_TPREL, + OutContext); TmpInst.getOperand(1) = MCOperand::createExpr(Exp); EmitToStreamer(*OutStreamer, TmpInst); return; @@ -920,7 +999,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::ADDIStlsgdHA: { // Transform: %xd = ADDIStlsgdHA %x2, @sym // Into: %xd = ADDIS8 %x2, sym@got@tlsgd@ha - assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC"); + assert(IsPPC64 && "Not supported for 32-bit PowerPC"); const MachineOperand &MO = MI->getOperand(2); const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); @@ -943,11 +1022,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); const MCExpr *SymGotTlsGD = MCSymbolRefExpr::create( - MOSymbol, Subtarget->isPPC64() ? MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO - : MCSymbolRefExpr::VK_PPC_GOT_TLSGD, + MOSymbol, IsPPC64 ? MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO + : MCSymbolRefExpr::VK_PPC_GOT_TLSGD, OutContext); EmitToStreamer(*OutStreamer, - MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI) + MCInstBuilder(IsPPC64 ? PPC::ADDI8 : PPC::ADDI) .addReg(MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()) .addExpr(SymGotTlsGD)); @@ -965,7 +1044,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::ADDIStlsldHA: { // Transform: %xd = ADDIStlsldHA %x2, @sym // Into: %xd = ADDIS8 %x2, sym@got@tlsld@ha - assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC"); + assert(IsPPC64 && "Not supported for 32-bit PowerPC"); const MachineOperand &MO = MI->getOperand(2); const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); @@ -988,11 +1067,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); const MCExpr *SymGotTlsLD = MCSymbolRefExpr::create( - MOSymbol, Subtarget->isPPC64() ? MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO - : MCSymbolRefExpr::VK_PPC_GOT_TLSLD, + MOSymbol, IsPPC64 ? MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO + : MCSymbolRefExpr::VK_PPC_GOT_TLSLD, OutContext); EmitToStreamer(*OutStreamer, - MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI) + MCInstBuilder(IsPPC64 ? PPC::ADDI8 : PPC::ADDI) .addReg(MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()) .addExpr(SymGotTlsLD)); @@ -1021,7 +1100,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { OutContext); EmitToStreamer( *OutStreamer, - MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDIS8 : PPC::ADDIS) + MCInstBuilder(IsPPC64 ? PPC::ADDIS8 : PPC::ADDIS) .addReg(MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()) .addExpr(SymDtprel)); @@ -1040,7 +1119,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_LO, OutContext); EmitToStreamer(*OutStreamer, - MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI) + MCInstBuilder(IsPPC64 ? 
PPC::ADDI8 : PPC::ADDI) .addReg(MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()) .addExpr(SymDtprel)); @@ -1087,7 +1166,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { // suite shows a handful of test cases that fail this check for // Darwin. Those need to be investigated before this sanity test // can be enabled for those subtargets. - if (!Subtarget->isDarwin()) { + if (!IsDarwin) { unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1; const MachineOperand &MO = MI->getOperand(OpNum); if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4) @@ -1098,7 +1177,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { } } - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); EmitToStreamer(*OutStreamer, TmpInst); } @@ -1368,15 +1447,16 @@ bool PPCLinuxAsmPrinter::doFinalization(Module &M) { ".got2", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); OutStreamer->SwitchSection(Section); - for (MapVector<MCSymbol*, MCSymbol*>::iterator I = TOC.begin(), - E = TOC.end(); I != E; ++I) { - OutStreamer->EmitLabel(I->second); - MCSymbol *S = I->first; + for (const auto &TOCMapPair : TOC) { + const MCSymbol *const TOCEntryTarget = TOCMapPair.first; + MCSymbol *const TOCEntryLabel = TOCMapPair.second; + + OutStreamer->EmitLabel(TOCEntryLabel); if (isPPC64) { - TS.emitTCEntry(*S); + TS.emitTCEntry(*TOCEntryTarget); } else { OutStreamer->EmitValueToAlignment(4); - OutStreamer->EmitSymbolValue(S, 4); + OutStreamer->EmitSymbolValue(TOCEntryTarget, 4); } } } @@ -1602,7 +1682,7 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) { if (!Stubs.empty()) { // Switch with ".non_lazy_symbol_pointer" directive. OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); - EmitAlignment(isPPC64 ? 3 : 2); + EmitAlignment(isPPC64 ? Align(8) : Align(4)); for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { // L_foo$stub: @@ -1643,6 +1723,106 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) { return AsmPrinter::doFinalization(M); } +void PPCAIXAsmPrinter::SetupMachineFunction(MachineFunction &MF) { + // Get the function descriptor symbol. + CurrentFnDescSym = getSymbol(&MF.getFunction()); + // Set the containing csect. + MCSectionXCOFF *FnDescSec = OutStreamer->getContext().getXCOFFSection( + CurrentFnDescSym->getName(), XCOFF::XMC_DS, XCOFF::XTY_SD, + XCOFF::C_HIDEXT, SectionKind::getData()); + cast<MCSymbolXCOFF>(CurrentFnDescSym)->setContainingCsect(FnDescSec); + + return AsmPrinter::SetupMachineFunction(MF); +} + +void PPCAIXAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { + // Early error checking limiting what is supported. + if (GV->isThreadLocal()) + report_fatal_error("Thread local not yet supported on AIX."); + + if (GV->hasSection()) + report_fatal_error("Custom section for Data not yet supported."); + + if (GV->hasComdat()) + report_fatal_error("COMDAT not yet supported by AIX."); + + SectionKind GVKind = getObjFileLowering().getKindForGlobal(GV, TM); + if (!GVKind.isCommon() && !GVKind.isBSSLocal() && !GVKind.isData()) + report_fatal_error("Encountered a global variable kind that is " + "not supported yet."); + + // Create the containing csect and switch to it. + MCSectionXCOFF *CSect = cast<MCSectionXCOFF>( + getObjFileLowering().SectionForGlobal(GV, GVKind, TM)); + OutStreamer->SwitchSection(CSect); + + // Create the symbol, set its storage class, and emit it. 
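// SetupMachineFunction above only creates the descriptor symbol and its
// XMC_DS csect; EmitFunctionDescriptor further down fills it in with three
// consecutive pointers. A rough C view of the XCOFF layout (struct and field
// names illustrative, not from the patch):
//
//   struct FunctionDescriptor {
//     void *EntryPoint;   // address of the function's code
//     void *TOCBase;      // TOC anchor for the callee, i.e. TOC[TC0]
//     void *Environment;  // emitted as 0 here
//   };
//
// Callers of a function pointer load the entry point and TOC base from such
// a descriptor instead of branching to a code address directly.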
+ MCSymbolXCOFF *GVSym = cast<MCSymbolXCOFF>(getSymbol(GV)); + GVSym->setStorageClass( + TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GV)); + GVSym->setContainingCsect(CSect); + + const DataLayout &DL = GV->getParent()->getDataLayout(); + + // Handle common symbols. + if (GVKind.isCommon() || GVKind.isBSSLocal()) { + unsigned Align = + GV->getAlignment() ? GV->getAlignment() : DL.getPreferredAlignment(GV); + uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType()); + + if (GVKind.isBSSLocal()) + OutStreamer->EmitXCOFFLocalCommonSymbol(GVSym, Size, Align); + else + OutStreamer->EmitCommonSymbol(GVSym, Size, Align); + return; + } + + MCSymbol *EmittedInitSym = GVSym; + EmitLinkage(GV, EmittedInitSym); + EmitAlignment(getGVAlignment(GV, DL), GV); + OutStreamer->EmitLabel(EmittedInitSym); + EmitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer()); +} + +void PPCAIXAsmPrinter::EmitFunctionDescriptor() { + const DataLayout &DL = getDataLayout(); + const unsigned PointerSize = DL.getPointerSizeInBits() == 64 ? 8 : 4; + + MCSectionSubPair Current = OutStreamer->getCurrentSection(); + // Emit function descriptor. + OutStreamer->SwitchSection( + cast<MCSymbolXCOFF>(CurrentFnDescSym)->getContainingCsect()); + OutStreamer->EmitLabel(CurrentFnDescSym); + // Emit function entry point address. + OutStreamer->EmitValue(MCSymbolRefExpr::create(CurrentFnSym, OutContext), + PointerSize); + // Emit TOC base address. + MCSymbol *TOCBaseSym = OutContext.getOrCreateSymbol(StringRef("TOC[TC0]")); + OutStreamer->EmitValue(MCSymbolRefExpr::create(TOCBaseSym, OutContext), + PointerSize); + // Emit a null environment pointer. + OutStreamer->EmitIntValue(0, PointerSize); + + OutStreamer->SwitchSection(Current.first, Current.second); +} + +void PPCAIXAsmPrinter::EmitEndOfAsmFile(Module &M) { + // If there are no functions in this module, we will never need to reference + // the TOC base. + if (M.empty()) + return; + + // Emit TOC base. + MCSymbol *TOCBaseSym = OutContext.getOrCreateSymbol(StringRef("TOC[TC0]")); + MCSectionXCOFF *TOCBaseSection = OutStreamer->getContext().getXCOFFSection( + StringRef("TOC"), XCOFF::XMC_TC0, XCOFF::XTY_SD, XCOFF::C_HIDEXT, + SectionKind::getData()); + cast<MCSymbolXCOFF>(TOCBaseSym)->setContainingCsect(TOCBaseSection); + // Switch to section to emit TOC base. + OutStreamer->SwitchSection(TOCBaseSection); +} + + /// createPPCAsmPrinterPass - Returns a pass that prints the PPC assembly code /// for a MachineFunction to the given output stream, in a format that the /// Darwin assembler can deal with. diff --git a/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/lib/Target/PowerPC/PPCBranchCoalescing.cpp index 5e9a661f8f0b..d325b078979f 100644 --- a/lib/Target/PowerPC/PPCBranchCoalescing.cpp +++ b/lib/Target/PowerPC/PPCBranchCoalescing.cpp @@ -340,9 +340,10 @@ bool PPCBranchCoalescing::identicalOperands( if (Op1.isIdenticalTo(Op2)) { // filter out instructions with physical-register uses - if (Op1.isReg() && TargetRegisterInfo::isPhysicalRegister(Op1.getReg()) - // If the physical register is constant then we can assume the value - // has not changed between uses. + if (Op1.isReg() && + Register::isPhysicalRegister(Op1.getReg()) + // If the physical register is constant then we can assume the value + // has not changed between uses. 
&& !(Op1.isUse() && MRI->isConstantPhysReg(Op1.getReg()))) { LLVM_DEBUG(dbgs() << "The operands are not provably identical.\n"); return false; @@ -355,8 +356,8 @@ bool PPCBranchCoalescing::identicalOperands( // definition of the register produces the same value. If they produce the // same value, consider them to be identical. if (Op1.isReg() && Op2.isReg() && - TargetRegisterInfo::isVirtualRegister(Op1.getReg()) && - TargetRegisterInfo::isVirtualRegister(Op2.getReg())) { + Register::isVirtualRegister(Op1.getReg()) && + Register::isVirtualRegister(Op2.getReg())) { MachineInstr *Op1Def = MRI->getVRegDef(Op1.getReg()); MachineInstr *Op2Def = MRI->getVRegDef(Op2.getReg()); if (TII->produceSameValue(*Op1Def, *Op2Def, MRI)) { @@ -456,7 +457,7 @@ bool PPCBranchCoalescing::canMoveToEnd(const MachineInstr &MI, << TargetMBB.getNumber() << "\n"); for (auto &Use : MI.uses()) { - if (Use.isReg() && TargetRegisterInfo::isVirtualRegister(Use.getReg())) { + if (Use.isReg() && Register::isVirtualRegister(Use.getReg())) { MachineInstr *DefInst = MRI->getVRegDef(Use.getReg()); if (DefInst->isPHI() && DefInst->getParent() == MI.getParent()) { LLVM_DEBUG(dbgs() << " *** Cannot move this instruction ***\n"); diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp index 793d690baec3..cdff4d383d23 100644 --- a/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -81,21 +81,20 @@ FunctionPass *llvm::createPPCBranchSelectionPass() { /// original Offset. unsigned PPCBSel::GetAlignmentAdjustment(MachineBasicBlock &MBB, unsigned Offset) { - unsigned Align = MBB.getAlignment(); - if (!Align) + const Align Alignment = MBB.getAlignment(); + if (Alignment == Align::None()) return 0; - unsigned AlignAmt = 1 << Align; - unsigned ParentAlign = MBB.getParent()->getAlignment(); + const Align ParentAlign = MBB.getParent()->getAlignment(); - if (Align <= ParentAlign) - return OffsetToAlignment(Offset, AlignAmt); + if (Alignment <= ParentAlign) + return offsetToAlignment(Offset, Alignment); // The alignment of this MBB is larger than the function's alignment, so we // can't tell whether or not it will insert nops. Assume that it will. 
if (FirstImpreciseBlock < 0) FirstImpreciseBlock = MBB.getNumber(); - return AlignAmt + OffsetToAlignment(Offset, AlignAmt); + return Alignment.value() + offsetToAlignment(Offset, Alignment); } /// We need to be careful about the offset of the first block in the function @@ -179,7 +178,7 @@ int PPCBSel::computeBranchSize(MachineFunction &Fn, const MachineBasicBlock *Dest, unsigned BrOffset) { int BranchSize; - unsigned MaxAlign = 2; + Align MaxAlign = Align(4); bool NeedExtraAdjustment = false; if (Dest->getNumber() <= Src->getNumber()) { // If this is a backwards branch, the delta is the offset from the @@ -192,8 +191,7 @@ int PPCBSel::computeBranchSize(MachineFunction &Fn, BranchSize += BlockSizes[DestBlock].first; for (unsigned i = DestBlock+1, e = Src->getNumber(); i < e; ++i) { BranchSize += BlockSizes[i].first; - MaxAlign = std::max(MaxAlign, - Fn.getBlockNumbered(i)->getAlignment()); + MaxAlign = std::max(MaxAlign, Fn.getBlockNumbered(i)->getAlignment()); } NeedExtraAdjustment = (FirstImpreciseBlock >= 0) && @@ -207,8 +205,7 @@ int PPCBSel::computeBranchSize(MachineFunction &Fn, MaxAlign = std::max(MaxAlign, Dest->getAlignment()); for (unsigned i = StartBlock+1, e = Dest->getNumber(); i != e; ++i) { BranchSize += BlockSizes[i].first; - MaxAlign = std::max(MaxAlign, - Fn.getBlockNumbered(i)->getAlignment()); + MaxAlign = std::max(MaxAlign, Fn.getBlockNumbered(i)->getAlignment()); } NeedExtraAdjustment = (FirstImpreciseBlock >= 0) && @@ -258,7 +255,7 @@ int PPCBSel::computeBranchSize(MachineFunction &Fn, // The computed offset is at most (MaxAlign.value() - 4) bytes smaller // than the actual offset. So we add this number to the offset for safety. if (NeedExtraAdjustment) - BranchSize += (1 << MaxAlign) - 4; + BranchSize += MaxAlign.value() - 4; return BranchSize; } @@ -339,16 +336,16 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { // 1. CR register // 2. Target MBB PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm(); - unsigned CRReg = I->getOperand(1).getReg(); + Register CRReg = I->getOperand(1).getReg(); // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition.
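// offsetToAlignment(Offset, Alignment), used by GetAlignmentAdjustment above,
// is the padding needed to reach the next boundary; for a power-of-two
// alignment it is equivalent to this sketch (names illustrative):
//
//   uint64_t Pad = -Offset & (Alignment.value() - 1);  // e.g. -20 & 15 == 12
//
// For a block whose alignment exceeds the function's, the code cannot know
// the residue of Offset at layout time, so it conservatively charges
// Alignment.value() + Pad bytes of possible nop padding.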
BuildMI(MBB, I, dl, TII->get(PPC::BCC)) .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2); } else if (I->getOpcode() == PPC::BC) { - unsigned CRBit = I->getOperand(0).getReg(); + Register CRBit = I->getOperand(0).getReg(); BuildMI(MBB, I, dl, TII->get(PPC::BCn)).addReg(CRBit).addImm(2); } else if (I->getOpcode() == PPC::BCn) { - unsigned CRBit = I->getOperand(0).getReg(); + Register CRBit = I->getOperand(0).getReg(); BuildMI(MBB, I, dl, TII->get(PPC::BC)).addReg(CRBit).addImm(2); } else if (I->getOpcode() == PPC::BDNZ) { BuildMI(MBB, I, dl, TII->get(PPC::BDZ)).addImm(2); diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp index 264d6b590f95..d8425d89da92 100644 --- a/lib/Target/PowerPC/PPCFastISel.cpp +++ b/lib/Target/PowerPC/PPCFastISel.cpp @@ -162,7 +162,7 @@ class PPCFastISel final : public FastISel { bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value, bool isZExt, unsigned DestReg, const PPC::Predicate Pred); - bool PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, + bool PPCEmitLoad(MVT VT, Register &ResultReg, Address &Addr, const TargetRegisterClass *RC, bool IsZExt = true, unsigned FP64LoadOpc = PPC::LFD); bool PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr); @@ -451,7 +451,7 @@ void PPCFastISel::PPCSimplifyAddress(Address &Addr, bool &UseOffset, // Emit a load instruction if possible, returning true if we succeeded, // otherwise false. See commentary below for how the register class of // the load is determined. -bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, +bool PPCFastISel::PPCEmitLoad(MVT VT, Register &ResultReg, Address &Addr, const TargetRegisterClass *RC, bool IsZExt, unsigned FP64LoadOpc) { unsigned Opc; @@ -469,7 +469,7 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, (ResultReg ? MRI.getRegClass(ResultReg) : (RC ? RC : (VT == MVT::f64 ? (HasSPE ? &PPC::SPERCRegClass : &PPC::F8RCRegClass) : - (VT == MVT::f32 ? (HasSPE ? &PPC::SPE4RCRegClass : &PPC::F4RCRegClass) : + (VT == MVT::f32 ? (HasSPE ? &PPC::GPRCRegClass : &PPC::F4RCRegClass) : (VT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass : &PPC::GPRC_and_GPRC_NOR0RegClass))))); @@ -612,7 +612,7 @@ bool PPCFastISel::SelectLoad(const Instruction *I) { const TargetRegisterClass *RC = AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr; - unsigned ResultReg = 0; + Register ResultReg = 0; if (!PPCEmitLoad(VT, ResultReg, Addr, RC, true, PPCSubTarget->hasSPE() ? PPC::EVLDD : PPC::LFD)) return false; @@ -989,7 +989,7 @@ bool PPCFastISel::SelectFPTrunc(const Instruction *I) { unsigned DestReg; auto RC = MRI.getRegClass(SrcReg); if (PPCSubTarget->hasSPE()) { - DestReg = createResultReg(&PPC::SPE4RCRegClass); + DestReg = createResultReg(&PPC::GPRCRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::EFSCFD), DestReg) .addReg(SrcReg); @@ -1051,7 +1051,7 @@ unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg, } const TargetRegisterClass *RC = &PPC::F8RCRegClass; - unsigned ResultReg = 0; + Register ResultReg = 0; if (!PPCEmitLoad(MVT::f64, ResultReg, Addr, RC, !IsSigned, LoadOpc)) return 0; @@ -1176,7 +1176,7 @@ unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT, const TargetRegisterClass *RC = AssignedReg ? 
MRI.getRegClass(AssignedReg) : nullptr; - unsigned ResultReg = 0; + Register ResultReg = 0; if (!PPCEmitLoad(VT, ResultReg, Addr, RC, !IsSigned)) return 0; @@ -1229,9 +1229,9 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) { if (PPCSubTarget->hasSPE()) { DestReg = createResultReg(&PPC::GPRCRegClass); if (IsSigned) - Opc = InRC == &PPC::SPE4RCRegClass ? PPC::EFSCTSIZ : PPC::EFDCTSIZ; + Opc = InRC == &PPC::GPRCRegClass ? PPC::EFSCTSIZ : PPC::EFDCTSIZ; else - Opc = InRC == &PPC::SPE4RCRegClass ? PPC::EFSCTUIZ : PPC::EFDCTUIZ; + Opc = InRC == &PPC::GPRCRegClass ? PPC::EFSCTUIZ : PPC::EFDCTUIZ; } else if (isVSFRCRegClass(RC)) { DestReg = createResultReg(&PPC::VSFRCRegClass); if (DstVT == MVT::i32) @@ -1717,7 +1717,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(RV)) { CCValAssign &VA = ValLocs[0]; - unsigned RetReg = VA.getLocReg(); + Register RetReg = VA.getLocReg(); // We still need to worry about properly extending the sign. For example, // we could have only a single bit or a constant that needs zero // extension rather than sign extension. Make sure we pass the return @@ -2002,7 +2002,7 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) { const bool HasSPE = PPCSubTarget->hasSPE(); const TargetRegisterClass *RC; if (HasSPE) - RC = ((VT == MVT::f32) ? &PPC::SPE4RCRegClass : &PPC::SPERCRegClass); + RC = ((VT == MVT::f32) ? &PPC::GPRCRegClass : &PPC::SPERCRegClass); else RC = ((VT == MVT::f32) ? &PPC::F4RCRegClass : &PPC::F8RCRegClass); @@ -2031,8 +2031,8 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) .addImm(0).addReg(TmpReg).addMemOperand(MMO); } else { - // Otherwise we generate LF[SD](Idx[lo], ADDIStocHA(X2, Idx)). - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA), + // Otherwise we generate LF[SD](Idx[lo], ADDIStocHA8(X2, Idx)). + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA8), TmpReg).addReg(PPC::X2).addConstantPoolIndex(Idx); // But for large code model, we must generate a LDtocL followed // by the LF[SD]. 
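// The medium- versus large-code-model split in PPCMaterializeFP above ends
// up as, schematically (registers and labels illustrative):
//
//   medium:  addis x11, x2, .LCPI0_0@toc@ha   ; constant pool entry addressed
//            lfd   f1, .LCPI0_0@toc@l(x11)    ;   directly off the TOC
//
//   large:   addis x11, x2, .LC0@toc@ha       ; .LC0 = TOC entry holding the
//            ld    x11, .LC0@toc@l(x11)       ;   address of .LCPI0_0 (LDtocL)
//            lfd   f1, 0(x11)
//
// so the large model pays one extra load to indirect through the TOC.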
@@ -2085,16 +2085,15 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { // or externally available linkage, a non-local function address, or a // jump table address (not yet needed), or if we are generating code // for large code model, we generate: - // LDtocL(GV, ADDIStocHA(%x2, GV)) + // LDtocL(GV, ADDIStocHA8(%x2, GV)) // Otherwise we generate: - // ADDItocL(ADDIStocHA(%x2, GV), GV) - // Either way, start with the ADDIStocHA: + // ADDItocL(ADDIStocHA8(%x2, GV), GV) + // Either way, start with the ADDIStocHA8: unsigned HighPartReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA), + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA8), HighPartReg).addReg(PPC::X2).addGlobalAddress(GV); - unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV); - if (GVFlags & PPCII::MO_NLP_FLAG) { + if (PPCSubTarget->isGVIndirectSymbol(GV)) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL), DestReg).addGlobalAddress(GV).addReg(HighPartReg); } else { @@ -2353,7 +2352,7 @@ bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, if (!PPCComputeAddress(LI->getOperand(0), Addr)) return false; - unsigned ResultReg = MI->getOperand(0).getReg(); + Register ResultReg = MI->getOperand(0).getReg(); if (!PPCEmitLoad(VT, ResultReg, Addr, nullptr, IsZExt, PPCSubTarget->hasSPE() ? PPC::EVLDD : PPC::LFD)) @@ -2464,7 +2463,7 @@ namespace llvm { const TargetLibraryInfo *LibInfo) { // Only available on 64-bit ELF for now. const PPCSubtarget &Subtarget = FuncInfo.MF->getSubtarget<PPCSubtarget>(); - if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) + if (Subtarget.is64BitELFABI()) return new PPCFastISel(FuncInfo, LibInfo); return nullptr; } diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index ebfb1ef7f49b..06a4d183e781 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -47,13 +47,15 @@ static const MCPhysReg VRRegNo[] = { }; static unsigned computeReturnSaveOffset(const PPCSubtarget &STI) { - if (STI.isDarwinABI()) + if (STI.isDarwinABI() || STI.isAIXABI()) return STI.isPPC64() ? 16 : 8; // SVR4 ABI: return STI.isPPC64() ? 16 : 4; } static unsigned computeTOCSaveOffset(const PPCSubtarget &STI) { + if (STI.isAIXABI()) + return STI.isPPC64() ? 40 : 20; return STI.isELFv2ABI() ? 24 : 40; } @@ -88,6 +90,11 @@ static unsigned computeBasePointerSaveOffset(const PPCSubtarget &STI) { : STI.getTargetMachine().isPositionIndependent() ? -12U : -8U; } +static unsigned computeCRSaveOffset() { + // The condition register save offset needs to be updated for AIX PPC32. + return 8; +} + PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI) : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, STI.getPlatformStackAlignment(), 0), @@ -95,7 +102,8 @@ PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI) TOCSaveOffset(computeTOCSaveOffset(Subtarget)), FramePointerSaveOffset(computeFramePointerSaveOffset(Subtarget)), LinkageSize(computeLinkageSize(Subtarget)), - BasePointerSaveOffset(computeBasePointerSaveOffset(STI)) {} + BasePointerSaveOffset(computeBasePointerSaveOffset(Subtarget)), + CRSaveOffset(computeCRSaveOffset()) {} // With the SVR4 ABI, callee-saved registers have fixed offsets on the stack. 
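// Collected from the compute*Offset helpers above, the fixed link-area save
// offsets are now, in bytes from the stack pointer:
//
//   return address: 8 (32-bit AIX/Darwin), 4 (32-bit SVR4), 16 (any 64-bit)
//   TOC save:       20 (32-bit AIX), 40 (64-bit AIX and ELFv1), 24 (ELFv2)
//   CR save:        8 (with the caveat above that AIX PPC32 still needs its
//                      own value)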
const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots( @@ -370,8 +378,8 @@ static void HandleVRSaveUpdate(MachineInstr &MI, const TargetInstrInfo &TII) { return; } - unsigned SrcReg = MI.getOperand(1).getReg(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); if ((UsedRegMask & 0xFFFF) == UsedRegMask) { if (DstReg != SrcReg) @@ -781,15 +789,18 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, bool isPPC64 = Subtarget.isPPC64(); // Get the ABI. bool isSVR4ABI = Subtarget.isSVR4ABI(); + bool isAIXABI = Subtarget.isAIXABI(); bool isELFv2ABI = Subtarget.isELFv2ABI(); - assert((Subtarget.isDarwinABI() || isSVR4ABI) && - "Currently only Darwin and SVR4 ABIs are supported for PowerPC."); + assert((Subtarget.isDarwinABI() || isSVR4ABI || isAIXABI) && + "Unsupported PPC ABI."); // Scan the prolog, looking for an UPDATE_VRSAVE instruction. If we find it, // process it. if (!isSVR4ABI) for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) { if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) { + if (isAIXABI) + report_fatal_error("UPDATE_VRSAVE is unexpected on AIX."); HandleVRSaveUpdate(*MBBI, TII); break; } @@ -819,7 +830,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, bool HasRedZone = isPPC64 || !isSVR4ABI; unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1; - unsigned BPReg = RegInfo->getBaseRegister(MF); + Register BPReg = RegInfo->getBaseRegister(MF); unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31; unsigned LRReg = isPPC64 ? PPC::LR8 : PPC::LR; unsigned TOCReg = isPPC64 ? PPC::X2 : PPC::R2; @@ -908,6 +919,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, assert((isPPC64 || !MustSaveCR) && "Prologue CR saving supported only in 64-bit mode"); + if (MustSaveCR && isAIXABI) + report_fatal_error("Prologue CR saving is unimplemented on AIX."); + // Check if we can move the stack update instruction (stdu) down the prologue // past the callee saves. Hopefully this will avoid the situation where the // saves are waiting for the update on the store with update to complete. @@ -966,7 +980,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, MIB.addReg(MustSaveCRs[i], CrState); BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8)) .addReg(TempReg, getKillRegState(true)) - .addImm(8) + .addImm(getCRSaveOffset()) .addReg(SPReg); } @@ -1020,7 +1034,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, assert(HasRedZone && "A red zone is always available on PPC64"); BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8)) .addReg(TempReg, getKillRegState(true)) - .addImm(8) + .addImm(getCRSaveOffset()) .addReg(SPReg); } @@ -1324,7 +1338,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, // actually saved gets its own CFI record. unsigned CRReg = isELFv2ABI? Reg : (unsigned) PPC::CR2; unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( - nullptr, MRI->getDwarfRegNum(CRReg, true), 8)); + nullptr, MRI->getDwarfRegNum(CRReg, true), getCRSaveOffset())); BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); continue; @@ -1387,7 +1401,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, bool HasRedZone = Subtarget.isPPC64() || !Subtarget.isSVR4ABI(); unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1; - unsigned BPReg = RegInfo->getBaseRegister(MF); + Register BPReg = RegInfo->getBaseRegister(MF); unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31; unsigned ScratchReg = 0; unsigned TempReg = isPPC64 ? 
PPC::X12 : PPC::R12; // another scratch reg @@ -1590,7 +1604,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, // is live here. assert(HasRedZone && "Expecting red zone"); BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg) - .addImm(8) + .addImm(getCRSaveOffset()) .addReg(SPReg); for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i) BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i]) @@ -1614,7 +1628,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, assert(isPPC64 && "Expecting 64-bit mode"); assert(RBReg == SPReg && "Should be using SP as a base register"); BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg) - .addImm(8) + .addImm(getCRSaveOffset()) .addReg(RBReg); } @@ -1762,8 +1776,8 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF, // Save R31 if necessary int FPSI = FI->getFramePointerSaveIndex(); - bool isPPC64 = Subtarget.isPPC64(); - bool isDarwinABI = Subtarget.isDarwinABI(); + const bool isPPC64 = Subtarget.isPPC64(); + const bool IsDarwinABI = Subtarget.isDarwinABI(); MachineFrameInfo &MFI = MF.getFrameInfo(); // If the frame pointer save index hasn't been defined yet. @@ -1812,7 +1826,7 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF, // For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the // function uses CR 2, 3, or 4. - if (!isPPC64 && !isDarwinABI && + if (!isPPC64 && !IsDarwinABI && (SavedRegs.test(PPC::CR2) || SavedRegs.test(PPC::CR3) || SavedRegs.test(PPC::CR4))) { @@ -1872,8 +1886,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, assert((!MF.getInfo<PPCFunctionInfo>()->mustSaveTOC() || (Reg != PPC::X2 && Reg != PPC::R2)) && "Not expecting to try to spill R2 in a function that must save TOC"); - if (PPC::GPRCRegClass.contains(Reg) || - PPC::SPE4RCRegClass.contains(Reg)) { + if (PPC::GPRCRegClass.contains(Reg)) { HasGPSaveArea = true; GPRegs.push_back(CSI[i]); @@ -1967,7 +1980,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, assert(FI && "No Base Pointer Save Slot!"); MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI)); - unsigned BP = RegInfo->getBaseRegister(MF); + Register BP = RegInfo->getBaseRegister(MF); if (PPC::G8RCRegClass.contains(BP)) { MinG8R = std::min<unsigned>(MinG8R, BP); HasG8SaveArea = true; @@ -2428,6 +2441,26 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, return true; } +unsigned PPCFrameLowering::getTOCSaveOffset() const { + if (Subtarget.isAIXABI()) + // TOC save/restore is normally handled by the linker. + // Indirect calls should hit this limitation. 
+ report_fatal_error("TOC save is not implemented on AIX yet."); + return TOCSaveOffset; +} + +unsigned PPCFrameLowering::getFramePointerSaveOffset() const { + if (Subtarget.isAIXABI()) + report_fatal_error("FramePointer is not implemented on AIX yet."); + return FramePointerSaveOffset; +} + +unsigned PPCFrameLowering::getBasePointerSaveOffset() const { + if (Subtarget.isAIXABI()) + report_fatal_error("BasePointer is not implemented on AIX yet."); + return BasePointerSaveOffset; +} + bool PPCFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { if (MF.getInfo<PPCFunctionInfo>()->shrinkWrapDisabled()) return false; diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h index d116e9fd22e1..a5fbc9acbb28 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.h +++ b/lib/Target/PowerPC/PPCFrameLowering.h @@ -26,6 +26,7 @@ class PPCFrameLowering: public TargetFrameLowering { const unsigned FramePointerSaveOffset; const unsigned LinkageSize; const unsigned BasePointerSaveOffset; + const unsigned CRSaveOffset; /** * Find register[s] that can be used in function prologue and epilogue @@ -142,15 +143,19 @@ public: /// getTOCSaveOffset - Return the previous frame offset to save the /// TOC register -- 64-bit SVR4 ABI only. - unsigned getTOCSaveOffset() const { return TOCSaveOffset; } + unsigned getTOCSaveOffset() const; /// getFramePointerSaveOffset - Return the previous frame offset to save the /// frame pointer. - unsigned getFramePointerSaveOffset() const { return FramePointerSaveOffset; } + unsigned getFramePointerSaveOffset() const; /// getBasePointerSaveOffset - Return the previous frame offset to save the /// base pointer. - unsigned getBasePointerSaveOffset() const { return BasePointerSaveOffset; } + unsigned getBasePointerSaveOffset() const; + + /// getCRSaveOffset - Return the previous frame offset to save the + /// CR register. + unsigned getCRSaveOffset() const { return CRSaveOffset; } /// getLinkageSize - Return the size of the PowerPC ABI linkage area. /// diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 543cac075f55..4ad6c88233fe 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -371,7 +371,7 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) { // by the scheduler. Detect them now. bool HasVectorVReg = false; for (unsigned i = 0, e = RegInfo->getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + unsigned Reg = Register::index2VirtReg(i); if (RegInfo->getRegClass(Reg) == &PPC::VRRCRegClass) { HasVectorVReg = true; break; @@ -391,8 +391,8 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) { // Create two vregs - one to hold the VRSAVE register that is live-in to the // function and one for the value after having bits or'd into it. 
- unsigned InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); - unsigned UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); + Register InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); + Register UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); const TargetInstrInfo &TII = *PPCSubTarget->getInstrInfo(); MachineBasicBlock &EntryBB = *Fn.begin(); @@ -447,7 +447,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { } else { BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR)); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg); - unsigned TempReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); + Register TempReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::UpdateGBR), GlobalBaseReg) .addReg(TempReg, RegState::Define).addReg(GlobalBaseReg); @@ -5065,52 +5065,95 @@ void PPCDAGToDAGISel::Select(SDNode *N) { return; } case PPCISD::TOC_ENTRY: { - assert ((PPCSubTarget->isPPC64() || PPCSubTarget->isSVR4ABI()) && - "Only supported for 64-bit ABI and 32-bit SVR4"); - if (PPCSubTarget->isSVR4ABI() && !PPCSubTarget->isPPC64()) { - SDValue GA = N->getOperand(0); - SDNode *MN = CurDAG->getMachineNode(PPC::LWZtoc, dl, MVT::i32, GA, - N->getOperand(1)); - transferMemOperands(N, MN); - ReplaceNode(N, MN); - return; - } + const bool isPPC64 = PPCSubTarget->isPPC64(); + const bool isELFABI = PPCSubTarget->isSVR4ABI(); + const bool isAIXABI = PPCSubTarget->isAIXABI(); + + assert(!PPCSubTarget->isDarwin() && "TOC is an ELF/XCOFF construct"); + + // PowerPC only supports the small, medium and large code models. + const CodeModel::Model CModel = TM.getCodeModel(); + assert(!(CModel == CodeModel::Tiny || CModel == CodeModel::Kernel) && + "PowerPC doesn't support tiny or kernel code models."); - // For medium and large code model, we generate two instructions as - // described below. Otherwise we allow SelectCodeCommon to handle this, + if (isAIXABI && CModel == CodeModel::Medium) + report_fatal_error("Medium code model is not supported on AIX."); + + // For 64-bit small code model, we allow SelectCodeCommon to handle this, // selecting one of LDtoc, LDtocJTI, LDtocCPT, and LDtocBA. - CodeModel::Model CModel = TM.getCodeModel(); - if (CModel != CodeModel::Medium && CModel != CodeModel::Large) + if (isPPC64 && CModel == CodeModel::Small) break; - // The first source operand is a TargetGlobalAddress or a TargetJumpTable. - // If it must be toc-referenced according to PPCSubTarget, we generate: - // LDtocL(@sym, ADDIStocHA(%x2, @sym)) + // Handle 32-bit small code model. + if (!isPPC64) { + // Transforms the ISD::TOC_ENTRY node into a PPC::LWZtoc machine node. + auto replaceWithLWZtoc = [this, &dl](SDNode *TocEntry) { + SDValue GA = TocEntry->getOperand(0); + SDValue TocBase = TocEntry->getOperand(1); + SDNode *MN = CurDAG->getMachineNode(PPC::LWZtoc, dl, MVT::i32, GA, + TocBase); + transferMemOperands(TocEntry, MN); + ReplaceNode(TocEntry, MN); + }; + + if (isELFABI) { + assert(TM.isPositionIndependent() && + "32-bit ELF can only have TOC entries in position independent" + " code."); + // 32-bit ELF always uses a small code model toc access.
+ replaceWithLWZtoc(N); + return; + } + + if (isAIXABI && CModel == CodeModel::Small) { + replaceWithLWZtoc(N); + return; + } + } + + assert(CModel != CodeModel::Small && "All small code models handled."); + + assert((isPPC64 || (isAIXABI && !isPPC64)) && "We are dealing with 64-bit" + " ELF/AIX or 32-bit AIX in the following."); + + // Transforms the ISD::TOC_ENTRY node for 32-bit AIX large code model mode + // or 64-bit medium (ELF-only) or large (ELF and AIX) code model code. We + // generate two instructions as described below. The first source operand + // is a symbol reference. If it must be toc-referenced according to + // PPCSubTarget, we generate: + // [32-bit AIX] + // LWZtocL(@sym, ADDIStocHA(%r2, @sym)) + // [64-bit ELF/AIX] + // LDtocL(@sym, ADDIStocHA8(%x2, @sym)) // Otherwise we generate: - // ADDItocL(ADDIStocHA(%x2, @sym), @sym) + // ADDItocL(ADDIStocHA8(%x2, @sym), @sym) SDValue GA = N->getOperand(0); SDValue TOCbase = N->getOperand(1); - SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64, - TOCbase, GA); + + EVT VT = isPPC64 ? MVT::i64 : MVT::i32; + SDNode *Tmp = CurDAG->getMachineNode( + isPPC64 ? PPC::ADDIStocHA8 : PPC::ADDIStocHA, dl, VT, TOCbase, GA); + if (PPCLowering->isAccessedAsGotIndirect(GA)) { - // If it is access as got-indirect, we need an extra LD to load + // If it is accessed as got-indirect, we need an extra LWZ/LD to load // the address. - SDNode *MN = CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA, - SDValue(Tmp, 0)); + SDNode *MN = CurDAG->getMachineNode( + isPPC64 ? PPC::LDtocL : PPC::LWZtocL, dl, VT, GA, SDValue(Tmp, 0)); + transferMemOperands(N, MN); ReplaceNode(N, MN); return; } - // Build the address relative to the TOC-pointer.. + // Build the address relative to the TOC-pointer. ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64, SDValue(Tmp, 0), GA)); return; } case PPCISD::PPC32_PICGOT: // Generate a PIC-safe GOT reference. - assert(!PPCSubTarget->isPPC64() && PPCSubTarget->isSVR4ABI() && - "PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4"); + assert(PPCSubTarget->is32BitELFABI() && + "PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4"); CurDAG->SelectNodeTo(N, PPC::PPC32PICGOT, PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::i32); @@ -6456,7 +6499,7 @@ void PPCDAGToDAGISel::PeepholePPC64() { continue; if (!HBase.isMachineOpcode() || - HBase.getMachineOpcode() != PPC::ADDIStocHA) + HBase.getMachineOpcode() != PPC::ADDIStocHA8) continue; if (!Base.hasOneUse() || !HBase.hasOneUse()) diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 24d50074860d..8cf6a660b08b 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -139,13 +139,13 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all // arguments are at least 4/8 bytes aligned. bool isPPC64 = Subtarget.isPPC64(); - setMinStackArgumentAlignment(isPPC64 ? 8:4); + setMinStackArgumentAlignment(isPPC64 ? Align(8) : Align(4)); // Set up the register classes. 
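// The two-instruction forms selected above print as follows for a
// got-indirect symbol versus a directly addressable one (64-bit shown;
// registers and the '.LC0' TOC-entry label illustrative):
//
//   indirect:  addis x3, x2, .LC0@toc@ha      direct:  addis x3, x2, sym@toc@ha
//              ld    x3, .LC0@toc@l(x3)                addi  x3, x3, sym@toc@l
//
// The 32-bit AIX large-model variant uses lwz and the '@u'/'@l' modifiers in
// place of '@toc@ha'/'@toc@l'.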
addRegisterClass(MVT::i32, &PPC::GPRCRegClass); if (!useSoftFloat()) { if (hasSPE()) { - addRegisterClass(MVT::f32, &PPC::SPE4RCRegClass); + addRegisterClass(MVT::f32, &PPC::GPRCRegClass); addRegisterClass(MVT::f64, &PPC::SPERCRegClass); } else { addRegisterClass(MVT::f32, &PPC::F4RCRegClass); @@ -431,28 +431,26 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); - if (Subtarget.isSVR4ABI()) { - if (isPPC64) { - // VAARG always uses double-word chunks, so promote anything smaller. - setOperationAction(ISD::VAARG, MVT::i1, Promote); - AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64); - setOperationAction(ISD::VAARG, MVT::i8, Promote); - AddPromotedToType (ISD::VAARG, MVT::i8, MVT::i64); - setOperationAction(ISD::VAARG, MVT::i16, Promote); - AddPromotedToType (ISD::VAARG, MVT::i16, MVT::i64); - setOperationAction(ISD::VAARG, MVT::i32, Promote); - AddPromotedToType (ISD::VAARG, MVT::i32, MVT::i64); - setOperationAction(ISD::VAARG, MVT::Other, Expand); - } else { - // VAARG is custom lowered with the 32-bit SVR4 ABI. - setOperationAction(ISD::VAARG, MVT::Other, Custom); - setOperationAction(ISD::VAARG, MVT::i64, Custom); - } + if (Subtarget.is64BitELFABI()) { + // VAARG always uses double-word chunks, so promote anything smaller. + setOperationAction(ISD::VAARG, MVT::i1, Promote); + AddPromotedToType(ISD::VAARG, MVT::i1, MVT::i64); + setOperationAction(ISD::VAARG, MVT::i8, Promote); + AddPromotedToType(ISD::VAARG, MVT::i8, MVT::i64); + setOperationAction(ISD::VAARG, MVT::i16, Promote); + AddPromotedToType(ISD::VAARG, MVT::i16, MVT::i64); + setOperationAction(ISD::VAARG, MVT::i32, Promote); + AddPromotedToType(ISD::VAARG, MVT::i32, MVT::i64); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + } else if (Subtarget.is32BitELFABI()) { + // VAARG is custom lowered with the 32-bit SVR4 ABI. + setOperationAction(ISD::VAARG, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::i64, Custom); } else setOperationAction(ISD::VAARG, MVT::Other, Expand); - if (Subtarget.isSVR4ABI() && !isPPC64) - // VACOPY is custom lowered with the 32-bit SVR4 ABI. + // VACOPY is custom lowered with the 32-bit SVR4 ABI. + if (Subtarget.is32BitELFABI()) setOperationAction(ISD::VACOPY , MVT::Other, Custom); else setOperationAction(ISD::VACOPY , MVT::Other, Expand); @@ -553,17 +551,25 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, if (Subtarget.hasAltivec()) { // First set operation action for all vector types to expand. Then we // will selectively turn on ones that can be effectively codegen'd. - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { // add/sub are legal for all supported vector VT's. setOperationAction(ISD::ADD, VT, Legal); setOperationAction(ISD::SUB, VT, Legal); // For v2i64, these are only valid with P8Vector. This is corrected after // the loop. 
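// The effect of the 64-bit ELF va_arg setup above is that every vararg slot
// is a full doubleword: a narrow read such as
//
//   int i = va_arg(ap, int);   // i1/i8/i16/i32 all widen to i64
//
// is performed as the i64 VAARG followed by truncation. 32-bit ELF keeps its
// custom lowering, and all other targets expand to the generic sequence.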
- setOperationAction(ISD::SMAX, VT, Legal); - setOperationAction(ISD::SMIN, VT, Legal); - setOperationAction(ISD::UMAX, VT, Legal); - setOperationAction(ISD::UMIN, VT, Legal); + if (VT.getSizeInBits() <= 128 && VT.getScalarSizeInBits() <= 64) { + setOperationAction(ISD::SMAX, VT, Legal); + setOperationAction(ISD::SMIN, VT, Legal); + setOperationAction(ISD::UMAX, VT, Legal); + setOperationAction(ISD::UMIN, VT, Legal); + } + else { + setOperationAction(ISD::SMAX, VT, Expand); + setOperationAction(ISD::SMIN, VT, Expand); + setOperationAction(ISD::UMAX, VT, Expand); + setOperationAction(ISD::UMIN, VT, Expand); + } if (Subtarget.hasVSX()) { setOperationAction(ISD::FMAXNUM, VT, Legal); @@ -646,7 +652,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); - for (MVT InnerVT : MVT::vector_valuetypes()) { + for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); @@ -944,7 +950,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand); setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal); - setOperationAction(ISD::FP_ROUND_INREG , MVT::v4f32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal); setOperationAction(ISD::FNEG , MVT::v4f64, Legal); @@ -1118,6 +1123,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::TRUNCATE); + setTargetDAGCombine(ISD::VECTOR_SHUFFLE); + if (Subtarget.useCRBits()) { setTargetDAGCombine(ISD::TRUNCATE); @@ -1172,9 +1179,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setJumpIsExpensive(); } - setMinFunctionAlignment(2); + setMinFunctionAlignment(Align(4)); if (Subtarget.isDarwin()) - setPrefFunctionAlignment(4); + setPrefFunctionAlignment(Align(16)); switch (Subtarget.getDarwinDirective()) { default: break; @@ -1191,8 +1198,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, case PPC::DIR_PWR7: case PPC::DIR_PWR8: case PPC::DIR_PWR9: - setPrefFunctionAlignment(4); - setPrefLoopAlignment(4); + setPrefLoopAlignment(Align(16)); + setPrefFunctionAlignment(Align(16)); break; } @@ -1352,6 +1359,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::SExtVElems: return "PPCISD::SExtVElems"; case PPCISD::LXVD2X: return "PPCISD::LXVD2X"; case PPCISD::STXVD2X: return "PPCISD::STXVD2X"; + case PPCISD::LOAD_VEC_BE: return "PPCISD::LOAD_VEC_BE"; + case PPCISD::STORE_VEC_BE: return "PPCISD::STORE_VEC_BE"; case PPCISD::ST_VSR_SCAL_INT: return "PPCISD::ST_VSR_SCAL_INT"; case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH"; @@ -1396,7 +1405,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE"; case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI"; case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH"; - case PPCISD::FP_EXTEND_LH: return "PPCISD::FP_EXTEND_LH"; + case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF"; + case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT"; } return nullptr; } @@ -1517,7 +1527,7 @@ bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { const PPCSubtarget& Subtarget = - 
static_cast<const PPCSubtarget&>(DAG.getSubtarget()); + static_cast<const PPCSubtarget&>(DAG.getSubtarget()); if (!Subtarget.hasP8Vector()) return false; @@ -1769,10 +1779,10 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a splat of a single element that is suitable for input to -/// VSPLTB/VSPLTH/VSPLTW. +/// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.). bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { - assert(N->getValueType(0) == MVT::v16i8 && - (EltSize == 1 || EltSize == 2 || EltSize == 4)); + assert(N->getValueType(0) == MVT::v16i8 && isPowerOf2_32(EltSize) && + EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes"); // The consecutive indices need to specify an element, not part of two // different elements. So abandon ship early if this isn't the case. @@ -2065,10 +2075,11 @@ bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM, } -/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the -/// specified isSplatShuffleMask VECTOR_SHUFFLE mask. -unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, - SelectionDAG &DAG) { +/// getSplatIdxForPPCMnemonics - Return the splat index as a value that is +/// appropriate for PPC mnemonics (which have a big endian bias - namely +/// elements are counted from the left of the vector register). +unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize, + SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); assert(isSplatShuffleMask(SVOp, EltSize)); if (DAG.getDataLayout().isLittleEndian()) @@ -2667,12 +2678,14 @@ static void setUsesTOCBasePtr(SelectionDAG &DAG) { setUsesTOCBasePtr(DAG.getMachineFunction()); } -static SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, bool Is64Bit, - SDValue GA) { +SDValue PPCTargetLowering::getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, + SDValue GA) const { + const bool Is64Bit = Subtarget.isPPC64(); EVT VT = Is64Bit ? MVT::i64 : MVT::i32; - SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT) : - DAG.getNode(PPCISD::GlobalBaseReg, dl, VT); - + SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT) + : Subtarget.isAIXABI() + ? DAG.getRegister(PPC::R2, VT) + : DAG.getNode(PPCISD::GlobalBaseReg, dl, VT); SDValue Ops[] = { GA, Reg }; return DAG.getMemIntrinsicNode( PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT, @@ -2688,10 +2701,10 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. - if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { + if (Subtarget.is64BitELFABI()) { setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0); - return getTOCEntry(DAG, SDLoc(CP), true, GA); + return getTOCEntry(DAG, SDLoc(CP), GA); } unsigned MOHiFlag, MOLoFlag; @@ -2701,7 +2714,7 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, if (IsPIC && Subtarget.isSVR4ABI()) { SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), PPCII::MO_PIC_FLAG); - return getTOCEntry(DAG, SDLoc(CP), false, GA); + return getTOCEntry(DAG, SDLoc(CP), GA); } SDValue CPIHi = @@ -2764,10 +2777,10 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. 
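// getSplatIdxForPPCMnemonics above converts a splat lane into the
// left-to-right numbering the hardware mnemonics use. For v4i32 on
// little-endian, lane 0 is element 3 counted from the left, so splatting it
// selects (registers illustrative):
//
//   vspltw v2, v3, 3
//
// while on big-endian the mask index is already in mnemonic order.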
- if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { + if (Subtarget.is64BitELFABI()) { setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); - return getTOCEntry(DAG, SDLoc(JT), true, GA); + return getTOCEntry(DAG, SDLoc(JT), GA); } unsigned MOHiFlag, MOLoFlag; @@ -2777,7 +2790,7 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { if (IsPIC && Subtarget.isSVR4ABI()) { SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, PPCII::MO_PIC_FLAG); - return getTOCEntry(DAG, SDLoc(GA), false, GA); + return getTOCEntry(DAG, SDLoc(GA), GA); } SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag); @@ -2793,14 +2806,18 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op, // 64-bit SVR4 ABI code is always position-independent. // The actual BlockAddress is stored in the TOC. - if (Subtarget.isSVR4ABI() && - (Subtarget.isPPC64() || isPositionIndependent())) { - if (Subtarget.isPPC64()) - setUsesTOCBasePtr(DAG); + if (Subtarget.is64BitELFABI()) { + setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset()); - return getTOCEntry(DAG, SDLoc(BASDN), Subtarget.isPPC64(), GA); + return getTOCEntry(DAG, SDLoc(BASDN), GA); } + // 32-bit position-independent ELF stores the BlockAddress in the .got. + if (Subtarget.is32BitELFABI() && isPositionIndependent()) + return getTOCEntry( + DAG, SDLoc(BASDN), + DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset())); + unsigned MOHiFlag, MOLoFlag; bool IsPIC = isPositionIndependent(); getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); @@ -2913,12 +2930,12 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, SDLoc DL(GSDN); const GlobalValue *GV = GSDN->getGlobal(); - // 64-bit SVR4 ABI code is always position-independent. + // 64-bit SVR4 ABI & AIX ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. - if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { + if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) { setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset()); - return getTOCEntry(DAG, DL, true, GA); + return getTOCEntry(DAG, DL, GA); } unsigned MOHiFlag, MOLoFlag; @@ -2929,7 +2946,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), PPCII::MO_PIC_FLAG); - return getTOCEntry(DAG, DL, false, GA); + return getTOCEntry(DAG, DL, GA); } SDValue GAHi = @@ -3235,8 +3252,8 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(SV, nextOffset)); } -/// FPR - The set of FP registers that should be allocated for arguments, -/// on Darwin. +/// FPR - The set of FP registers that should be allocated for arguments +/// on Darwin and AIX. 
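// With the change above, every 64-bit ELF and AIX global access funnels into
// one node shape. A condensed sketch of getTOCEntry from earlier in this
// file, showing the ABI-dependent TOC base:
//
//   SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT)
//                         : Subtarget.isAIXABI()
//                               ? DAG.getRegister(PPC::R2, VT)
//                               : DAG.getNode(PPCISD::GlobalBaseReg, dl, VT);
//   SDValue Ops[] = {GA, Reg};
//   // ... DAG.getMemIntrinsicNode(PPCISD::TOC_ENTRY, ...)
//
// The TOC_ENTRY case in PPCISelDAGToDAG.cpp then matches this node into the
// LWZtoc / LDtoc* / ADDIStocHA8 forms shown earlier.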
static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13}; @@ -3377,17 +3394,17 @@ SDValue PPCTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { - if (Subtarget.isSVR4ABI()) { - if (Subtarget.isPPC64()) - return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, - dl, DAG, InVals); - else - return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, - dl, DAG, InVals); - } else { - return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, - dl, DAG, InVals); - } + if (Subtarget.is64BitELFABI()) + return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG, + InVals); + else if (Subtarget.is32BitELFABI()) + return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG, + InVals); + + // FIXME: We are using this for both AIX and Darwin. We should add appropriate + // AIX testing, and rename it appropriately. + return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, dl, DAG, + InVals); } SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( @@ -3467,7 +3484,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( if (Subtarget.hasP8Vector()) RC = &PPC::VSSRCRegClass; else if (Subtarget.hasSPE()) - RC = &PPC::SPE4RCRegClass; + RC = &PPC::GPRCRegClass; else RC = &PPC::F4RCRegClass; break; @@ -4516,7 +4533,7 @@ callsShareTOCBase(const Function *Caller, SDValue Callee, static bool needStackSlotPassParameters(const PPCSubtarget &Subtarget, const SmallVectorImpl<ISD::OutputArg> &Outs) { - assert(Subtarget.isSVR4ABI() && Subtarget.isPPC64()); + assert(Subtarget.is64BitELFABI()); const unsigned PtrByteSize = 8; const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); @@ -4926,7 +4943,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, ImmutableCallSite CS, const PPCSubtarget &Subtarget) { bool isPPC64 = Subtarget.isPPC64(); bool isSVR4ABI = Subtarget.isSVR4ABI(); - bool isELFv2ABI = Subtarget.isELFv2ABI(); + bool is64BitELFv1ABI = isPPC64 && isSVR4ABI && !Subtarget.isELFv2ABI(); bool isAIXABI = Subtarget.isAIXABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); @@ -4997,7 +5014,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, // to do the call, we can't use PPCISD::CALL. SDValue MTCTROps[] = {Chain, Callee, InFlag}; - if (isSVR4ABI && isPPC64 && !isELFv2ABI) { + if (is64BitELFv1ABI) { // Function pointers in the 64-bit SVR4 ABI do not point to the function // entry point, but to the function descriptor (the function entry point // address is part of the function descriptor though). @@ -5085,7 +5102,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, CallOpc = PPCISD::BCTRL; Callee.setNode(nullptr); // Add use of X11 (holding environment pointer) - if (isSVR4ABI && isPPC64 && !isELFv2ABI && !hasNest) + if (is64BitELFv1ABI && !hasNest) Ops.push_back(DAG.getRegister(PPC::X11, PtrVT)); // Add CTR register as callee so a bctr can be emitted later. if (isTailCall) @@ -6730,8 +6747,12 @@ SDValue PPCTargetLowering::LowerCall_AIX( const unsigned NumGPRs = isPPC64 ? 
array_lengthof(GPR_64) : array_lengthof(GPR_32); + const unsigned NumFPRs = array_lengthof(FPR); + assert(NumFPRs == 13 && "Only FPR 1-13 could be used for parameter passing " + "on AIX"); + const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; - unsigned GPR_idx = 0; + unsigned GPR_idx = 0, FPR_idx = 0; SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; @@ -6768,6 +6789,20 @@ SDValue PPCTargetLowering::LowerCall_AIX( break; case MVT::f32: case MVT::f64: + if (FPR_idx != NumFPRs) { + RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg)); + + // If we have any FPRs remaining, we may also have GPRs remaining. + // Args passed in FPRs consume 1 or 2 (f64 in 32 bit mode) available + // GPRs. + if (GPR_idx != NumGPRs) + ++GPR_idx; + if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64) + ++GPR_idx; + } else + report_fatal_error("Handling of placing parameters on the stack is " + "unimplemented!"); + break; case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: @@ -8152,6 +8187,18 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { Op0.getOperand(1)); } +static const SDValue *getNormalLoadInput(const SDValue &Op) { + const SDValue *InputLoad = &Op; + if (InputLoad->getOpcode() == ISD::BITCAST) + InputLoad = &InputLoad->getOperand(0); + if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR) + InputLoad = &InputLoad->getOperand(0); + if (InputLoad->getOpcode() != ISD::LOAD) + return nullptr; + LoadSDNode *LD = cast<LoadSDNode>(*InputLoad); + return ISD::isNormalLoad(LD) ? InputLoad : nullptr; +} + // If this is a case we can't handle, return null and let the default // expansion code take care of it. If we CAN select this case, and if it // selects to a single instruction, return Op. Otherwise, if we can codegen @@ -8274,6 +8321,34 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, HasAnyUndefs, 0, !Subtarget.isLittleEndian()) || SplatBitSize > 32) { + + const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0)); + // Handle load-and-splat patterns as we have instructions that will do this + // in one go. + if (InputLoad && DAG.isSplatValue(Op, true)) { + LoadSDNode *LD = cast<LoadSDNode>(*InputLoad); + + // We have handling for 4 and 8 byte elements. + unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits(); + + // Checking for a single use of this load, we have to check for vector + // width (128 bits) / ElementSize uses (since each operand of the + // BUILD_VECTOR is a separate use of the value. + if (InputLoad->getNode()->hasNUsesOfValue(128 / ElementSize, 0) && + ((Subtarget.hasVSX() && ElementSize == 64) || + (Subtarget.hasP9Vector() && ElementSize == 32))) { + SDValue Ops[] = { + LD->getChain(), // Chain + LD->getBasePtr(), // Ptr + DAG.getValueType(Op.getValueType()) // VT + }; + return + DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, + DAG.getVTList(Op.getValueType(), MVT::Other), + Ops, LD->getMemoryVT(), LD->getMemOperand()); + } + } + // BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be // lowered to VSX instructions under certain conditions. // Without VSX, there is no pattern more efficient than expanding the node. @@ -8759,6 +8834,45 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, unsigned ShiftElts, InsertAtByte; bool Swap = false; + + // If this is a load-and-splat, we can do that with a single instruction + // in some cases. 
However if the load has multiple uses, we don't want to + // combine it because that will just produce multiple loads. + const SDValue *InputLoad = getNormalLoadInput(V1); + if (InputLoad && Subtarget.hasVSX() && V2.isUndef() && + (PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) && + InputLoad->hasOneUse()) { + bool IsFourByte = PPC::isSplatShuffleMask(SVOp, 4); + int SplatIdx = + PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG); + + LoadSDNode *LD = cast<LoadSDNode>(*InputLoad); + // For 4-byte load-and-splat, we need Power9. + if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) { + uint64_t Offset = 0; + if (IsFourByte) + Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4; + else + Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8; + SDValue BasePtr = LD->getBasePtr(); + if (Offset != 0) + BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + BasePtr, DAG.getIntPtrConstant(Offset, dl)); + SDValue Ops[] = { + LD->getChain(), // Chain + BasePtr, // BasePtr + DAG.getValueType(Op.getValueType()) // VT + }; + SDVTList VTL = + DAG.getVTList(IsFourByte ? MVT::v4i32 : MVT::v2i64, MVT::Other); + SDValue LdSplt = + DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, VTL, + Ops, LD->getMemoryVT(), LD->getMemOperand()); + if (LdSplt.getValueType() != SVOp->getValueType(0)) + LdSplt = DAG.getBitcast(SVOp->getValueType(0), LdSplt); + return LdSplt; + } + } if (Subtarget.hasP9Vector() && PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap, isLittleEndian)) { @@ -8835,7 +8949,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, if (Subtarget.hasVSX()) { if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) { - int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG); + int SplatIdx = PPC::getSplatIdxForPPCMnemonics(SVOp, 4, DAG); SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv, @@ -9880,6 +9994,30 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { switch (Op0.getOpcode()) { default: return SDValue(); + case ISD::EXTRACT_SUBVECTOR: { + assert(Op0.getNumOperands() == 2 && + isa<ConstantSDNode>(Op0->getOperand(1)) && + "Node should have 2 operands with second one being a constant!"); + + if (Op0.getOperand(0).getValueType() != MVT::v4f32) + return SDValue(); + + // Custom lower is only done for high or low doubleword. + int Idx = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); + if (Idx % 2 != 0) + return SDValue(); + + // Since input is v4f32, at this point Idx is either 0 or 2. + // Shift to get the doubleword position we want. + int DWord = Idx >> 1; + + // High and low word positions are different on little endian. + if (Subtarget.isLittleEndian()) + DWord ^= 0x1; + + return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, + Op0.getOperand(0), DAG.getConstant(DWord, dl, MVT::i32)); + } case ISD::FADD: case ISD::FMUL: case ISD::FSUB: { @@ -9891,26 +10029,25 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { return SDValue(); // Generate new load node. 
LoadSDNode *LD = cast<LoadSDNode>(LdOp); - SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() }; - NewLoad[i] = - DAG.getMemIntrinsicNode(PPCISD::LD_VSX_LH, dl, - DAG.getVTList(MVT::v4f32, MVT::Other), - LoadOps, LD->getMemoryVT(), - LD->getMemOperand()); + SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()}; + NewLoad[i] = DAG.getMemIntrinsicNode( + PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps, + LD->getMemoryVT(), LD->getMemOperand()); } - SDValue NewOp = DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32, - NewLoad[0], NewLoad[1], - Op0.getNode()->getFlags()); - return DAG.getNode(PPCISD::FP_EXTEND_LH, dl, MVT::v2f64, NewOp); + SDValue NewOp = + DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32, NewLoad[0], + NewLoad[1], Op0.getNode()->getFlags()); + return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewOp, + DAG.getConstant(0, dl, MVT::i32)); } case ISD::LOAD: { LoadSDNode *LD = cast<LoadSDNode>(Op0); - SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() }; - SDValue NewLd = - DAG.getMemIntrinsicNode(PPCISD::LD_VSX_LH, dl, - DAG.getVTList(MVT::v4f32, MVT::Other), - LoadOps, LD->getMemoryVT(), LD->getMemOperand()); - return DAG.getNode(PPCISD::FP_EXTEND_LH, dl, MVT::v2f64, NewLd); + SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()}; + SDValue NewLd = DAG.getMemIntrinsicNode( + PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps, + LD->getMemoryVT(), LD->getMemOperand()); + return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewLd, + DAG.getConstant(0, dl, MVT::i32)); } } llvm_unreachable("ERROR:Should return for all cases within swtich."); @@ -10048,9 +10185,11 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, return; case ISD::TRUNCATE: { EVT TrgVT = N->getValueType(0); + EVT OpVT = N->getOperand(0).getValueType(); if (TrgVT.isVector() && isOperationCustom(N->getOpcode(), TrgVT) && - N->getOperand(0).getValueType().getSizeInBits() <= 128) + OpVT.getSizeInBits() <= 128 && + isPowerOf2_32(OpVT.getVectorElementType().getSizeInBits())) Results.push_back(LowerTRUNCATEVector(SDValue(N, 0), DAG)); return; } @@ -10192,7 +10331,7 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, if (CmpOpcode) { // Signed comparisons of byte or halfword values must be sign-extended. if (CmpOpcode == PPC::CMPW && AtomicSize < 4) { - unsigned ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); + Register ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); BuildMI(BB, dl, TII->get(AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH), ExtReg).addReg(dest); BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) @@ -10243,10 +10382,10 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( MachineFunction *F = BB->getParent(); MachineFunction::iterator It = ++BB->getIterator(); - unsigned dest = MI.getOperand(0).getReg(); - unsigned ptrA = MI.getOperand(1).getReg(); - unsigned ptrB = MI.getOperand(2).getReg(); - unsigned incr = MI.getOperand(3).getReg(); + Register dest = MI.getOperand(0).getReg(); + Register ptrA = MI.getOperand(1).getReg(); + Register ptrB = MI.getOperand(2).getReg(); + Register incr = MI.getOperand(3).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); @@ -10364,7 +10503,7 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( if (CmpOpcode) { // For unsigned comparisons, we can directly compare the shifted values. // For signed comparisons we shift and sign extend. 
- unsigned SReg = RegInfo.createVirtualRegister(GPRC); + Register SReg = RegInfo.createVirtualRegister(GPRC); BuildMI(BB, dl, TII->get(PPC::AND), SReg) .addReg(TmpDestReg) .addReg(MaskReg); @@ -10375,7 +10514,7 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( BuildMI(BB, dl, TII->get(PPC::SRW), ValueReg) .addReg(SReg) .addReg(ShiftReg); - unsigned ValueSReg = RegInfo.createVirtualRegister(GPRC); + Register ValueSReg = RegInfo.createVirtualRegister(GPRC); BuildMI(BB, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueSReg) .addReg(ValueReg); ValueReg = ValueSReg; @@ -10426,11 +10565,11 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); - unsigned mainDstReg = MRI.createVirtualRegister(RC); - unsigned restoreDstReg = MRI.createVirtualRegister(RC); + Register mainDstReg = MRI.createVirtualRegister(RC); + Register restoreDstReg = MRI.createVirtualRegister(RC); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && @@ -10482,10 +10621,10 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, // Prepare IP either in reg. const TargetRegisterClass *PtrRC = getRegClassFor(PVT); - unsigned LabelReg = MRI.createVirtualRegister(PtrRC); - unsigned BufReg = MI.getOperand(1).getReg(); + Register LabelReg = MRI.createVirtualRegister(PtrRC); + Register BufReg = MI.getOperand(1).getReg(); - if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) { + if (Subtarget.is64BitELFABI()) { setUsesTOCBasePtr(*MBB->getParent()); MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD)) .addReg(PPC::X2) @@ -10570,7 +10709,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, const TargetRegisterClass *RC = (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; - unsigned Tmp = MRI.createVirtualRegister(RC); + Register Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31; unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1; @@ -10587,7 +10726,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, const int64_t TOCOffset = 3 * PVT.getStoreSize(); const int64_t BPOffset = 4 * PVT.getStoreSize(); - unsigned BufReg = MI.getOperand(0).getReg(); + Register BufReg = MI.getOperand(0).getReg(); // Reload FP (the jumped-to function may not have had a // frame pointer, and if so, then its r31 will be restored @@ -10662,7 +10801,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { if (MI.getOpcode() == TargetOpcode::STACKMAP || MI.getOpcode() == TargetOpcode::PATCHPOINT) { - if (Subtarget.isPPC64() && Subtarget.isSVR4ABI() && + if (Subtarget.is64BitELFABI() && MI.getOpcode() == TargetOpcode::PATCHPOINT) { // Call lowering should have added an r2 operand to indicate a dependence // on the TOC base pointer value. 
It can't however, because there is no @@ -10828,15 +10967,15 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BB = readMBB; MachineRegisterInfo &RegInfo = F->getRegInfo(); - unsigned ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); - unsigned LoReg = MI.getOperand(0).getReg(); - unsigned HiReg = MI.getOperand(1).getReg(); + Register ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); + Register LoReg = MI.getOperand(0).getReg(); + Register HiReg = MI.getOperand(1).getReg(); BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269); BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268); BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269); - unsigned CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); + Register CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg) .addReg(HiReg) @@ -10978,11 +11117,11 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, StoreMnemonic = PPC::STDCX; break; } - unsigned dest = MI.getOperand(0).getReg(); - unsigned ptrA = MI.getOperand(1).getReg(); - unsigned ptrB = MI.getOperand(2).getReg(); - unsigned oldval = MI.getOperand(3).getReg(); - unsigned newval = MI.getOperand(4).getReg(); + Register dest = MI.getOperand(0).getReg(); + Register ptrA = MI.getOperand(1).getReg(); + Register ptrB = MI.getOperand(2).getReg(); + Register oldval = MI.getOperand(3).getReg(); + Register newval = MI.getOperand(4).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); @@ -11057,11 +11196,11 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, bool isLittleEndian = Subtarget.isLittleEndian(); bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8; - unsigned dest = MI.getOperand(0).getReg(); - unsigned ptrA = MI.getOperand(1).getReg(); - unsigned ptrB = MI.getOperand(2).getReg(); - unsigned oldval = MI.getOperand(3).getReg(); - unsigned newval = MI.getOperand(4).getReg(); + Register dest = MI.getOperand(0).getReg(); + Register ptrA = MI.getOperand(1).getReg(); + Register ptrB = MI.getOperand(2).getReg(); + Register oldval = MI.getOperand(3).getReg(); + Register newval = MI.getOperand(4).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); @@ -11238,13 +11377,13 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // This pseudo performs an FADD with rounding mode temporarily forced // to round-to-zero. We emit this via custom inserter since the FPSCR // is not modeled at the SelectionDAG level. - unsigned Dest = MI.getOperand(0).getReg(); - unsigned Src1 = MI.getOperand(1).getReg(); - unsigned Src2 = MI.getOperand(2).getReg(); + Register Dest = MI.getOperand(0).getReg(); + Register Src1 = MI.getOperand(1).getReg(); + Register Src2 = MI.getOperand(2).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineRegisterInfo &RegInfo = F->getRegInfo(); - unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass); + Register MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass); // Save FPSCR value. BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg); @@ -11270,7 +11409,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8); MachineRegisterInfo &RegInfo = F->getRegInfo(); - unsigned Dest = RegInfo.createVirtualRegister( + Register Dest = RegInfo.createVirtualRegister( Opcode == PPC::ANDIo ? 
&PPC::GPRCRegClass : &PPC::G8RCRegClass); DebugLoc dl = MI.getDebugLoc(); @@ -11283,7 +11422,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } else if (MI.getOpcode() == PPC::TCHECK_RET) { DebugLoc Dl = MI.getDebugLoc(); MachineRegisterInfo &RegInfo = F->getRegInfo(); - unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); + Register CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg); BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) @@ -11297,7 +11436,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(PPC::CR0EQ); } else if (MI.getOpcode() == PPC::SETRNDi) { DebugLoc dl = MI.getDebugLoc(); - unsigned OldFPSCRReg = MI.getOperand(0).getReg(); + Register OldFPSCRReg = MI.getOperand(0).getReg(); // Save FPSCR value. BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg); @@ -11378,7 +11517,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } }; - unsigned OldFPSCRReg = MI.getOperand(0).getReg(); + Register OldFPSCRReg = MI.getOperand(0).getReg(); // Save FPSCR value. BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg); @@ -11393,12 +11532,12 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // mtfsf 255, NewFPSCRReg MachineOperand SrcOp = MI.getOperand(1); MachineRegisterInfo &RegInfo = F->getRegInfo(); - unsigned OldFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); + Register OldFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); copyRegFromG8RCOrF8RC(OldFPSCRTmpReg, OldFPSCRReg); - unsigned ImDefReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); - unsigned ExtSrcReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); + Register ImDefReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); + Register ExtSrcReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); // The first operand of INSERT_SUBREG should be a register which has // subregisters, we only care about its RegClass, so we should use an @@ -11409,14 +11548,14 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .add(SrcOp) .addImm(1); - unsigned NewFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); + Register NewFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); BuildMI(*BB, MI, dl, TII->get(PPC::RLDIMI), NewFPSCRTmpReg) .addReg(OldFPSCRTmpReg) .addReg(ExtSrcReg) .addImm(0) .addImm(62); - unsigned NewFPSCRReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass); + Register NewFPSCRReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass); copyRegFromG8RCOrF8RC(NewFPSCRReg, NewFPSCRTmpReg); // The mask 255 means that put the 32:63 bits of NewFPSCRReg to the 32:63 @@ -13113,6 +13252,61 @@ SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N, return Val; } +SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN, + LSBaseSDNode *LSBase, + DAGCombinerInfo &DCI) const { + assert((ISD::isNormalLoad(LSBase) || ISD::isNormalStore(LSBase)) && + "Not a reverse memop pattern!"); + + auto IsElementReverse = [](const ShuffleVectorSDNode *SVN) -> bool { + auto Mask = SVN->getMask(); + int i = 0; + auto I = Mask.rbegin(); + auto E = Mask.rend(); + + for (; I != E; ++I) { + if (*I != i) + return false; + i++; + } + return true; + }; + + SelectionDAG &DAG = DCI.DAG; + EVT VT = SVN->getValueType(0); + + if (!isTypeLegal(VT) || !Subtarget.isLittleEndian() || !Subtarget.hasVSX()) + return SDValue(); + + // Before P9, we have PPCVSXSwapRemoval pass to hack the element 
order.
+ // See comment in PPCVSXSwapRemoval.cpp.
+ // It conflicts with the PPCVSXSwapRemoval optimization, so we don't do it here.
+ if (!Subtarget.hasP9Vector())
+ return SDValue();
+
+ if (!IsElementReverse(SVN))
+ return SDValue();
+
+ if (LSBase->getOpcode() == ISD::LOAD) {
+ SDLoc dl(SVN);
+ SDValue LoadOps[] = {LSBase->getChain(), LSBase->getBasePtr()};
+ return DAG.getMemIntrinsicNode(
+ PPCISD::LOAD_VEC_BE, dl, DAG.getVTList(VT, MVT::Other), LoadOps,
+ LSBase->getMemoryVT(), LSBase->getMemOperand());
+ }
+
+ if (LSBase->getOpcode() == ISD::STORE) {
+ SDLoc dl(LSBase);
+ SDValue StoreOps[] = {LSBase->getChain(), SVN->getOperand(0),
+ LSBase->getBasePtr()};
+ return DAG.getMemIntrinsicNode(
+ PPCISD::STORE_VEC_BE, dl, DAG.getVTList(MVT::Other), StoreOps,
+ LSBase->getMemoryVT(), LSBase->getMemOperand());
+ }
+
+ llvm_unreachable("Expected a load or store node here");
+}
+
 SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG;
@@ -13159,6 +13353,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return combineFPToIntToFP(N, DCI);
+ case ISD::VECTOR_SHUFFLE:
+ if (ISD::isNormalLoad(N->getOperand(0).getNode())) {
+ LSBaseSDNode *LSBase = cast<LSBaseSDNode>(N->getOperand(0));
+ return combineVReverseMemOP(cast<ShuffleVectorSDNode>(N), LSBase, DCI);
+ }
+ break;
 case ISD::STORE: { EVT Op1VT = N->getOperand(1).getValueType();
@@ -13170,6 +13370,13 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, return Val; }
+ if (Opcode == ISD::VECTOR_SHUFFLE && ISD::isNormalStore(N)) {
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N->getOperand(1));
+ SDValue Val = combineVReverseMemOP(SVN, cast<LSBaseSDNode>(N), DCI);
+ if (Val)
+ return Val;
+ }
+
 // Turn STORE (BSWAP) -> sthbrx/stwbrx. if (cast<StoreSDNode>(N)->isUnindexed() && Opcode == ISD::BSWAP && N->getOperand(1).getNode()->hasOneUse() &&
@@ -13903,7 +14110,7 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, } }
-unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
 switch (Subtarget.getDarwinDirective()) { default: break; case PPC::DIR_970:
@@ -13924,7 +14131,7 @@ unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { // Actual alignment of the loop will depend on the hotness check and other // logic in alignBlocks. if (ML->getLoopDepth() > 1 && ML->getSubLoops().empty())
- return 5;
+ return Align(32);
 }
 const PPCInstrInfo *TII = Subtarget.getInstrInfo();
@@ -13940,7 +14147,7 @@ unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { }
 if (LoopSize > 16 && LoopSize <= 32)
- return 5;
+ return Align(32);
 break; }
@@ -14063,7 +14270,7 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case 'f': if (Subtarget.hasSPE()) { if (VT == MVT::f32 || VT == MVT::i32)
- return std::make_pair(0U, &PPC::SPE4RCRegClass);
+ return std::make_pair(0U, &PPC::GPRCRegClass);
 if (VT == MVT::f64 || VT == MVT::i64) return std::make_pair(0U, &PPC::SPERCRegClass); } else {
@@ -14306,22 +14513,22 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo.
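A brief aside on the reverse-memop combine introduced earlier in this hunk: the IsElementReverse lambda is compact, so here is a standalone sketch of the same check in plain C++ (not patch code; the driver is invented). A shuffle is an element reverse exactly when the mask, read back to front, counts 0, 1, 2, and so on.

#include <cstdio>
#include <vector>

// True iff Mask is exactly [N-1, N-2, ..., 1, 0], the pattern that
// combineVReverseMemOP folds into LOAD_VEC_BE / STORE_VEC_BE.
static bool isElementReverse(const std::vector<int> &Mask) {
  int Expected = 0;
  for (auto I = Mask.rbegin(), E = Mask.rend(); I != E; ++I)
    if (*I != Expected++)
      return false;
  return true;
}

int main() {
  std::vector<int> Rev = {3, 2, 1, 0}; // a true element reverse
  std::vector<int> Rot = {1, 2, 3, 0}; // a rotate, not a reverse
  std::printf("reverse: %d, rotate: %d\n", isElementReverse(Rev),
              isElementReverse(Rot));
}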
-unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { +Register PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const { bool isPPC64 = Subtarget.isPPC64(); - bool isDarwinABI = Subtarget.isDarwinABI(); + bool IsDarwinABI = Subtarget.isDarwinABI(); if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) || (!isPPC64 && VT != MVT::i32)) report_fatal_error("Invalid register global variable type"); bool is64Bit = isPPC64 && VT == MVT::i64; - unsigned Reg = StringSwitch<unsigned>(RegName) + Register Reg = StringSwitch<Register>(RegName) .Case("r1", is64Bit ? PPC::X1 : PPC::R1) - .Case("r2", (isDarwinABI || isPPC64) ? 0 : PPC::R2) - .Case("r13", (!isPPC64 && isDarwinABI) ? 0 : + .Case("r2", (IsDarwinABI || isPPC64) ? Register() : PPC::R2) + .Case("r13", (!isPPC64 && IsDarwinABI) ? Register() : (is64Bit ? PPC::X13 : PPC::R13)) - .Default(0); + .Default(Register()); if (Reg) return Reg; @@ -14330,14 +14537,17 @@ unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT, bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const { // 32-bit SVR4 ABI access everything as got-indirect. - if (Subtarget.isSVR4ABI() && !Subtarget.isPPC64()) + if (Subtarget.is32BitELFABI()) + return true; + + // AIX accesses everything indirectly through the TOC, which is similar to + // the GOT. + if (Subtarget.isAIXABI()) return true; CodeModel::Model CModel = getTargetMachine().getCodeModel(); // If it is small or large code model, module locals are accessed - // indirectly by loading their address from .toc/.got. The difference - // is that for large code model we have ADDISTocHa + LDtocL and for - // small code model we simply have LDtoc. + // indirectly by loading their address from .toc/.got. if (CModel == CodeModel::Small || CModel == CodeModel::Large) return true; @@ -14345,14 +14555,8 @@ bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const { if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA)) return true; - if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) { - const GlobalValue *GV = G->getGlobal(); - unsigned char GVFlags = Subtarget.classifyGlobalReference(GV); - // The NLP flag indicates that a global access has to use an - // extra indirection. 
- if (GVFlags & PPCII::MO_NLP_FLAG) - return true; - } + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) + return Subtarget.isGVIndirectSymbol(G->getGlobal()); return false; } @@ -14417,7 +14621,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(0); Info.offset = -VT.getStoreSize()+1; Info.size = 2*VT.getStoreSize()-1; - Info.align = 1; + Info.align = Align::None(); Info.flags = MachineMemOperand::MOLoad; return true; } @@ -14451,7 +14655,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.size = VT.getStoreSize(); - Info.align = 1; + Info.align = Align::None(); Info.flags = MachineMemOperand::MOLoad; return true; } @@ -14503,7 +14707,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(1); Info.offset = -VT.getStoreSize()+1; Info.size = 2*VT.getStoreSize()-1; - Info.align = 1; + Info.align = Align::None(); Info.flags = MachineMemOperand::MOStore; return true; } @@ -14536,7 +14740,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(1); Info.offset = 0; Info.size = VT.getStoreSize(); - Info.align = 1; + Info.align = Align::None(); Info.flags = MachineMemOperand::MOStore; return true; } @@ -14786,7 +14990,7 @@ void PPCTargetLowering::insertCopiesSplitCSR( else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); - unsigned NewVR = MRI->createVirtualRegister(RC); + Register NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be @@ -15146,7 +15350,7 @@ SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const { bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { // Only duplicate to increase tail-calls for the 64bit SysV ABIs. - if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64()) + if (!Subtarget.is64BitELFABI()) return false; // If not a tail call then no need to proceed. diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 97422c6eda36..62922ea2d4c4 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -412,8 +412,9 @@ namespace llvm { /// representation. QBFLT, - /// Custom extend v4f32 to v2f64. - FP_EXTEND_LH, + /// FP_EXTEND_HALF(VECTOR, IDX) - Custom extend upper (IDX=0) half or + /// lower (IDX=1) half of v4f32 to v2f64. + FP_EXTEND_HALF, /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a /// byte-swapping store instruction. It byte-swaps the low "Type" bits of @@ -456,15 +457,29 @@ namespace llvm { /// an xxswapd. LXVD2X, + /// VSRC, CHAIN = LOAD_VEC_BE CHAIN, Ptr - Occurs only for little endian. + /// Maps directly to one of lxvd2x/lxvw4x/lxvh8x/lxvb16x depending on + /// the vector type to load vector in big-endian element order. + LOAD_VEC_BE, + /// VSRC, CHAIN = LD_VSX_LH CHAIN, Ptr - This is a floating-point load of a /// v2f32 value into the lower half of a VSR register. LD_VSX_LH, + /// VSRC, CHAIN = LD_SPLAT, CHAIN, Ptr - a splatting load memory + /// instructions such as LXVDSX, LXVWSX. + LD_SPLAT, + /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian. /// Maps directly to an stxvd2x instruction that will be preceded by /// an xxswapd. STXVD2X, + /// CHAIN = STORE_VEC_BE CHAIN, VSRC, Ptr - Occurs only for little endian. 
+ /// Maps directly to one of stxvd2x/stxvw4x/stxvh8x/stxvb16x depending on + /// the vector type to store vector in big-endian element order. + STORE_VEC_BE, + /// Store scalar integers from VSR. ST_VSR_SCAL_INT, @@ -563,9 +578,11 @@ namespace llvm { bool isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, unsigned &InsertAtByte, bool &Swap, bool IsLE); - /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the - /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. - unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG); + /// getSplatIdxForPPCMnemonics - Return the splat index as a value that is + /// appropriate for PPC mnemonics (which have a big endian bias - namely + /// elements are counted from the left of the vector register). + unsigned getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize, + SelectionDAG &DAG); /// get_VSPLTI_elt - If this is a build_vector of constants which can be /// formed by using a vspltis[bhw] instruction of the specified element @@ -716,8 +733,8 @@ namespace llvm { SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl<SDNode *> &Created) const override; - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, @@ -725,7 +742,7 @@ namespace llvm { const SelectionDAG &DAG, unsigned Depth = 0) const override; - unsigned getPrefLoopAlignment(MachineLoop *ML) const override; + Align getPrefLoopAlignment(MachineLoop *ML) const override; bool shouldInsertFencesForAtomic(const Instruction *I) const override { return true; @@ -834,6 +851,18 @@ namespace llvm { return true; } + bool isDesirableToTransformToIntegerOp(unsigned Opc, + EVT VT) const override { + // Only handle float load/store pair because float(fpr) load/store + // instruction has more cycles than integer(gpr) load/store in PPC. + if (Opc != ISD::LOAD && Opc != ISD::STORE) + return false; + if (VT != MVT::f32 && VT != MVT::f64) + return false; + + return true; + } + // Returns true if the address of the global is stored in TOC entry. bool isAccessedAsGotIndirect(SDValue N) const; @@ -998,6 +1027,8 @@ namespace llvm { SDValue &FPOpOut, const SDLoc &dl) const; + SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, SDValue GA) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; @@ -1155,6 +1186,8 @@ namespace llvm { SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineABS(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineVSelect(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineVReverseMemOP(ShuffleVectorSDNode *SVN, LSBaseSDNode *LSBase, + DAGCombinerInfo &DCI) const; /// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces /// SETCC with integer subtraction when (1) there is a legal way of doing it diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index d598567f8e4e..f16187149d36 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -1099,8 +1099,8 @@ def LDMX : XForm_1<31, 309, (outs g8rc:$rD), (ins memrr:$src), // Support for medium and large code model. 
let hasSideEffects = 0 in { let isReMaterializable = 1 in { -def ADDIStocHA: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), - "#ADDIStocHA", []>, isPPC64; +def ADDIStocHA8: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), + "#ADDIStocHA8", []>, isPPC64; def ADDItocL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), "#ADDItocL", []>, isPPC64; } diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index 8176c5120a83..fd3fc2af2327 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -215,21 +215,21 @@ def vsldoi_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), // VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm. def VSPLTB_get_imm : SDNodeXForm<vector_shuffle, [{ - return getI32Imm(PPC::getVSPLTImmediate(N, 1, *CurDAG), SDLoc(N)); + return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 1, *CurDAG), SDLoc(N)); }]>; def vspltb_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 1); }], VSPLTB_get_imm>; def VSPLTH_get_imm : SDNodeXForm<vector_shuffle, [{ - return getI32Imm(PPC::getVSPLTImmediate(N, 2, *CurDAG), SDLoc(N)); + return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 2, *CurDAG), SDLoc(N)); }]>; def vsplth_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 2); }], VSPLTH_get_imm>; def VSPLTW_get_imm : SDNodeXForm<vector_shuffle, [{ - return getI32Imm(PPC::getVSPLTImmediate(N, 4, *CurDAG), SDLoc(N)); + return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 4, *CurDAG), SDLoc(N)); }]>; def vspltw_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ @@ -331,7 +331,7 @@ class VXBX_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty> class VXCR_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty> : VXForm_CR<xo, (outs vrrc:$vD), (ins vrrc:$vA, u1imm:$ST, u4imm:$SIX), !strconcat(opc, " $vD, $vA, $ST, $SIX"), IIC_VecFP, - [(set Ty:$vD, (IntID Ty:$vA, imm:$ST, imm:$SIX))]>; + [(set Ty:$vD, (IntID Ty:$vA, timm:$ST, timm:$SIX))]>; //===----------------------------------------------------------------------===// // Instruction Definitions. @@ -401,10 +401,10 @@ let isCodeGenOnly = 1 in { def MFVSCR : VXForm_4<1540, (outs vrrc:$vD), (ins), "mfvscr $vD", IIC_LdStStore, - [(set v8i16:$vD, (int_ppc_altivec_mfvscr))]>; + [(set v8i16:$vD, (int_ppc_altivec_mfvscr))]>; def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB), "mtvscr $vB", IIC_LdStLoad, - [(int_ppc_altivec_mtvscr v4i32:$vB)]>; + [(int_ppc_altivec_mtvscr v4i32:$vB)]>; let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in { // Loads. 
def LVEBX: XForm_1_memOp<31, 7, (outs vrrc:$vD), (ins memrr:$src), diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td index a48eb1690695..96b9c9a119c0 100644 --- a/lib/Target/PowerPC/PPCInstrFormats.td +++ b/lib/Target/PowerPC/PPCInstrFormats.td @@ -1209,20 +1209,13 @@ class XX3Form<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, let Inst{31} = XT{5}; } -class XX3Form_Zero<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, +class XX3Form_SameOp<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> : XX3Form<opcode, xo, OOL, IOL, asmstr, itin, pattern> { let XA = XT; let XB = XT; } -class XX3Form_SetZero<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, - InstrItinClass itin, list<dag> pattern> - : XX3Form<opcode, xo, OOL, IOL, asmstr, itin, pattern> { - let XB = XT; - let XA = XT; -} - class XX3Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> : I<opcode, OOL, IOL, asmstr, itin> { diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index a787bdd56b9d..6b10672965c9 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -90,7 +90,6 @@ enum SpillOpcodeKey { SOK_QuadBitSpill, SOK_SpillToVSR, SOK_SPESpill, - SOK_SPE4Spill, SOK_LastOpcodeSpill // This must be last on the enum. }; @@ -184,10 +183,10 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, return Latency; const MachineOperand &DefMO = DefMI.getOperand(DefIdx); - unsigned Reg = DefMO.getReg(); + Register Reg = DefMO.getReg(); bool IsRegCR; - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { const MachineRegisterInfo *MRI = &DefMI.getParent()->getParent()->getRegInfo(); IsRegCR = MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRRCRegClass) || @@ -330,11 +329,13 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case PPC::LIS8: case PPC::QVGPCI: case PPC::ADDIStocHA: + case PPC::ADDIStocHA8: case PPC::ADDItocL: case PPC::LOAD_STACK_GUARD: case PPC::XXLXORz: case PPC::XXLXORspz: case PPC::XXLXORdpz: + case PPC::XXLEQVOnes: case PPC::V_SET0B: case PPC::V_SET0H: case PPC::V_SET0: @@ -448,7 +449,8 @@ MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, return &MI; } -bool PPCInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, +bool PPCInstrInfo::findCommutedOpIndices(const MachineInstr &MI, + unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { // For VSX A-Type FMA instructions, it is the first two operands that can be // commuted, however, because the non-encoded tied input operand is listed @@ -966,11 +968,11 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, getKillRegState(KillSrc); return; } else if (PPC::SPERCRegClass.contains(SrcReg) && - PPC::SPE4RCRegClass.contains(DestReg)) { + PPC::GPRCRegClass.contains(DestReg)) { BuildMI(MBB, I, DL, get(PPC::EFSCFD), DestReg).addReg(SrcReg); getKillRegState(KillSrc); return; - } else if (PPC::SPE4RCRegClass.contains(SrcReg) && + } else if (PPC::GPRCRegClass.contains(SrcReg) && PPC::SPERCRegClass.contains(DestReg)) { BuildMI(MBB, I, DL, get(PPC::EFDCFS), DestReg).addReg(SrcReg); getKillRegState(KillSrc); @@ -1009,8 +1011,6 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, Opc = PPC::QVFMRb; else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg)) Opc = PPC::CROR; - else if 
(PPC::SPE4RCRegClass.contains(DestReg, SrcReg)) - Opc = PPC::OR; else if (PPC::SPERCRegClass.contains(DestReg, SrcReg)) Opc = PPC::EVOR; else @@ -1043,8 +1043,6 @@ unsigned PPCInstrInfo::getStoreOpcodeForSpill(unsigned Reg, OpcodeIndex = SOK_Float4Spill; } else if (PPC::SPERCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_SPESpill; - } else if (PPC::SPE4RCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_SPE4Spill; } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_CRSpill; } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) { @@ -1083,8 +1081,6 @@ unsigned PPCInstrInfo::getStoreOpcodeForSpill(unsigned Reg, OpcodeIndex = SOK_Float4Spill; } else if (PPC::SPERCRegClass.contains(Reg)) { OpcodeIndex = SOK_SPESpill; - } else if (PPC::SPE4RCRegClass.contains(Reg)) { - OpcodeIndex = SOK_SPE4Spill; } else if (PPC::CRRCRegClass.contains(Reg)) { OpcodeIndex = SOK_CRSpill; } else if (PPC::CRBITRCRegClass.contains(Reg)) { @@ -1133,8 +1129,6 @@ PPCInstrInfo::getLoadOpcodeForSpill(unsigned Reg, OpcodeIndex = SOK_Float4Spill; } else if (PPC::SPERCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_SPESpill; - } else if (PPC::SPE4RCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_SPE4Spill; } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_CRSpill; } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) { @@ -1173,8 +1167,6 @@ PPCInstrInfo::getLoadOpcodeForSpill(unsigned Reg, OpcodeIndex = SOK_Float4Spill; } else if (PPC::SPERCRegClass.contains(Reg)) { OpcodeIndex = SOK_SPESpill; - } else if (PPC::SPE4RCRegClass.contains(Reg)) { - OpcodeIndex = SOK_SPE4Spill; } else if (PPC::CRRCRegClass.contains(Reg)) { OpcodeIndex = SOK_CRSpill; } else if (PPC::CRBITRCRegClass.contains(Reg)) { @@ -1648,7 +1640,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, return false; int OpC = CmpInstr.getOpcode(); - unsigned CRReg = CmpInstr.getOperand(0).getReg(); + Register CRReg = CmpInstr.getOperand(0).getReg(); // FP record forms set CR1 based on the exception status bits, not a // comparison with zero. @@ -1671,7 +1663,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, // Look through copies unless that gets us to a physical register. unsigned ActualSrc = TRI->lookThruCopyLike(SrcReg, MRI); - if (TargetRegisterInfo::isVirtualRegister(ActualSrc)) + if (Register::isVirtualRegister(ActualSrc)) SrcReg = ActualSrc; // Get the unique definition of SrcReg. @@ -1937,7 +1929,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, // Rotates are expensive instructions. If we're emitting a record-form // rotate that can just be an andi/andis, we should just emit that. 
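As background for the record-form rotate rewrite mentioned just above: a PPC (MB, ME) pair selects a wrapping bit range counted from the most significant bit. The following is a standalone sketch of that mask construction and of when such a mask fits an andi./andis. immediate, assuming the usual PPC bit numbering; the helper and the classification in main() are illustrative only, and the real transform handles more cases (for example nonzero shift amounts).

#include <cstdint>
#include <cstdio>

// Build the 32-bit mask selected by PPC mask-begin/mask-end fields. PPC
// numbers bits from the MSB, so PPC bit I is value bit 31 - I, and the
// range wraps when MB > ME.
static uint32_t maskFromMBME(unsigned MB, unsigned ME) {
  uint32_t Mask = 0;
  for (unsigned I = MB;; I = (I + 1) & 31) {
    Mask |= 1u << (31 - I);
    if (I == ME)
      break;
  }
  return Mask;
}

int main() {
  uint32_t M = maskFromMBME(24, 31);     // low byte: 0x000000FF
  bool FitsAndi = (M & 0xFFFF0000u) == 0;  // mask within low halfword
  bool FitsAndis = (M & 0x0000FFFFu) == 0; // mask within high halfword
  std::printf("mask=0x%08X andi=%d andis=%d\n", M, FitsAndi, FitsAndis);
}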
if (MIOpC == PPC::RLWINM || MIOpC == PPC::RLWINM8) { - unsigned GPRRes = MI->getOperand(0).getReg(); + Register GPRRes = MI->getOperand(0).getReg(); int64_t SH = MI->getOperand(2).getImm(); int64_t MB = MI->getOperand(3).getImm(); int64_t ME = MI->getOperand(4).getImm(); @@ -2122,7 +2114,7 @@ bool PPCInstrInfo::expandVSXMemPseudo(MachineInstr &MI) const { llvm_unreachable("Unknown Operation!"); } - unsigned TargetReg = MI.getOperand(0).getReg(); + Register TargetReg = MI.getOperand(0).getReg(); unsigned Opcode; if ((TargetReg >= PPC::F0 && TargetReg <= PPC::F31) || (TargetReg >= PPC::VSL0 && TargetReg <= PPC::VSL31)) @@ -2184,7 +2176,7 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return expandVSXMemPseudo(MI); } case PPC::SPILLTOVSR_LD: { - unsigned TargetReg = MI.getOperand(0).getReg(); + Register TargetReg = MI.getOperand(0).getReg(); if (PPC::VSFRCRegClass.contains(TargetReg)) { MI.setDesc(get(PPC::DFLOADf64)); return expandPostRAPseudo(MI); @@ -2194,7 +2186,7 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } case PPC::SPILLTOVSR_ST: { - unsigned SrcReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(0).getReg(); if (PPC::VSFRCRegClass.contains(SrcReg)) { NumStoreSPILLVSRRCAsVec++; MI.setDesc(get(PPC::DFSTOREf64)); @@ -2206,7 +2198,7 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } case PPC::SPILLTOVSR_LDX: { - unsigned TargetReg = MI.getOperand(0).getReg(); + Register TargetReg = MI.getOperand(0).getReg(); if (PPC::VSFRCRegClass.contains(TargetReg)) MI.setDesc(get(PPC::LXSDX)); else @@ -2214,7 +2206,7 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } case PPC::SPILLTOVSR_STX: { - unsigned SrcReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(0).getReg(); if (PPC::VSFRCRegClass.contains(SrcReg)) { NumStoreSPILLVSRRCAsVec++; MI.setDesc(get(PPC::STXSDX)); @@ -2279,10 +2271,10 @@ void PPCInstrInfo::replaceInstrOperandWithImm(MachineInstr &MI, int64_t Imm) const { assert(MI.getOperand(OpNo).isReg() && "Operand must be a REG"); // Replace the REG with the Immediate. 
- unsigned InUseReg = MI.getOperand(OpNo).getReg(); + Register InUseReg = MI.getOperand(OpNo).getReg(); MI.getOperand(OpNo).ChangeToImmediate(Imm); - if (empty(MI.implicit_operands())) + if (MI.implicit_operands().empty()) return; // We need to make sure that the MI didn't have any implicit use @@ -2328,6 +2320,23 @@ void PPCInstrInfo::replaceInstrWithLI(MachineInstr &MI, .addImm(LII.Imm); } +MachineInstr *PPCInstrInfo::getDefMIPostRA(unsigned Reg, MachineInstr &MI, + bool &SeenIntermediateUse) const { + assert(!MI.getParent()->getParent()->getRegInfo().isSSA() && + "Should be called after register allocation."); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + MachineBasicBlock::reverse_iterator E = MI.getParent()->rend(), It = MI; + It++; + SeenIntermediateUse = false; + for (; It != E; ++It) { + if (It->modifiesRegister(Reg, TRI)) + return &*It; + if (It->readsRegister(Reg, TRI)) + SeenIntermediateUse = true; + } + return nullptr; +} + MachineInstr *PPCInstrInfo::getForwardingDefMI( MachineInstr &MI, unsigned &OpNoForForwarding, @@ -2342,11 +2351,11 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI( for (int i = 1, e = MI.getNumOperands(); i < e; i++) { if (!MI.getOperand(i).isReg()) continue; - unsigned Reg = MI.getOperand(i).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MI.getOperand(i).getReg(); + if (!Register::isVirtualRegister(Reg)) continue; unsigned TrueReg = TRI->lookThruCopyLike(Reg, MRI); - if (TargetRegisterInfo::isVirtualRegister(TrueReg)) { + if (Register::isVirtualRegister(TrueReg)) { DefMI = MRI->getVRegDef(TrueReg); if (DefMI->getOpcode() == PPC::LI || DefMI->getOpcode() == PPC::LI8) { OpNoForForwarding = i; @@ -2370,7 +2379,10 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI( Opc == PPC::RLDICL_32 || Opc == PPC::RLDICL_32_64 || Opc == PPC::RLWINM || Opc == PPC::RLWINMo || Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8o; - if (!instrHasImmForm(MI, III, true) && !ConvertibleImmForm) + bool IsVFReg = (MI.getNumOperands() && MI.getOperand(0).isReg()) + ? isVFRegister(MI.getOperand(0).getReg()) + : false; + if (!ConvertibleImmForm && !instrHasImmForm(Opc, IsVFReg, III, true)) return nullptr; // Don't convert or %X, %Y, %Y since that's just a register move. @@ -2381,29 +2393,24 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI( MachineOperand &MO = MI.getOperand(i); SeenIntermediateUse = false; if (MO.isReg() && MO.isUse() && !MO.isImplicit()) { - MachineBasicBlock::reverse_iterator E = MI.getParent()->rend(), It = MI; - It++; - unsigned Reg = MI.getOperand(i).getReg(); - - // Is this register defined by some form of add-immediate (including - // load-immediate) within this basic block? - for ( ; It != E; ++It) { - if (It->modifiesRegister(Reg, &getRegisterInfo())) { - switch (It->getOpcode()) { - default: break; - case PPC::LI: - case PPC::LI8: - case PPC::ADDItocL: - case PPC::ADDI: - case PPC::ADDI8: - OpNoForForwarding = i; - return &*It; - } + Register Reg = MI.getOperand(i).getReg(); + // If we see another use of this reg between the def and the MI, + // we want to flat it so the def isn't deleted. + MachineInstr *DefMI = getDefMIPostRA(Reg, MI, SeenIntermediateUse); + if (DefMI) { + // Is this register defined by some form of add-immediate (including + // load-immediate) within this basic block? + switch (DefMI->getOpcode()) { + default: break; - } else if (It->readsRegister(Reg, &getRegisterInfo())) - // If we see another use of this reg between the def and the MI, - // we want to flat it so the def isn't deleted. 
- SeenIntermediateUse = true; + case PPC::LI: + case PPC::LI8: + case PPC::ADDItocL: + case PPC::ADDI: + case PPC::ADDI8: + OpNoForForwarding = i; + return DefMI; + } } } } @@ -2417,7 +2424,7 @@ const unsigned *PPCInstrInfo::getStoreOpcodesForSpillArray() const { {PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX, PPC::SPILL_VRSAVE, PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb, - PPC::SPILLTOVSR_ST, PPC::EVSTDD, PPC::SPESTW}, + PPC::SPILLTOVSR_ST, PPC::EVSTDD}, // Power 9 {PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, @@ -2433,7 +2440,7 @@ const unsigned *PPCInstrInfo::getLoadOpcodesForSpillArray() const { {PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXVD2X, PPC::LXSDX, PPC::LXSSPX, PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, PPC::QVLFDXb, - PPC::SPILLTOVSR_LD, PPC::EVLDD, PPC::SPELWZ}, + PPC::SPILLTOVSR_LD, PPC::EVLDD}, // Power 9 {PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, PPC::DFLOADf32, @@ -2538,12 +2545,15 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, "The forwarding operand needs to be valid at this point"); bool IsForwardingOperandKilled = MI.getOperand(ForwardingOperand).isKill(); bool KillFwdDefMI = !SeenIntermediateUse && IsForwardingOperandKilled; - unsigned ForwardingOperandReg = MI.getOperand(ForwardingOperand).getReg(); + Register ForwardingOperandReg = MI.getOperand(ForwardingOperand).getReg(); if (KilledDef && KillFwdDefMI) *KilledDef = DefMI; ImmInstrInfo III; - bool HasImmForm = instrHasImmForm(MI, III, PostRA); + bool IsVFReg = MI.getOperand(0).isReg() + ? isVFRegister(MI.getOperand(0).getReg()) + : false; + bool HasImmForm = instrHasImmForm(MI.getOpcode(), IsVFReg, III, PostRA); // If this is a reg+reg instruction that has a reg+imm form, // and one of the operands is produced by an add-immediate, // try to convert it. @@ -2591,7 +2601,7 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, // If a compare-immediate is fed by an immediate and is itself an input of // an ISEL (the most common case) into a COPY of the correct register. bool Changed = false; - unsigned DefReg = MI.getOperand(0).getReg(); + Register DefReg = MI.getOperand(0).getReg(); int64_t Comparand = MI.getOperand(2).getImm(); int64_t SExtComparand = ((uint64_t)Comparand & ~0x7FFFuLL) != 0 ? 
(Comparand | 0xFFFFFFFFFFFF0000) : Comparand; @@ -2601,8 +2611,8 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, if (UseOpc != PPC::ISEL && UseOpc != PPC::ISEL8) continue; unsigned CRSubReg = CompareUseMI.getOperand(3).getSubReg(); - unsigned TrueReg = CompareUseMI.getOperand(1).getReg(); - unsigned FalseReg = CompareUseMI.getOperand(2).getReg(); + Register TrueReg = CompareUseMI.getOperand(1).getReg(); + Register FalseReg = CompareUseMI.getOperand(2).getReg(); unsigned RegToCopy = selectReg(SExtImm, SExtComparand, Opc, TrueReg, FalseReg, CRSubReg); if (RegToCopy == PPC::NoRegister) @@ -2777,9 +2787,8 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, return false; } -bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, +bool PPCInstrInfo::instrHasImmForm(unsigned Opc, bool IsVFReg, ImmInstrInfo &III, bool PostRA) const { - unsigned Opc = MI.getOpcode(); // The vast majority of the instructions would need their operand 2 replaced // with an immediate when switching to the reg+imm form. A marked exception // are the update form loads/stores for which a constant operand 2 would need @@ -3111,7 +3120,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, break; case PPC::LXSSPX: if (PostRA) { - if (isVFRegister(MI.getOperand(0).getReg())) + if (IsVFReg) III.ImmOpcode = PPC::LXSSP; else { III.ImmOpcode = PPC::LFS; @@ -3125,7 +3134,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, break; case PPC::LXSDX: if (PostRA) { - if (isVFRegister(MI.getOperand(0).getReg())) + if (IsVFReg) III.ImmOpcode = PPC::LXSD; else { III.ImmOpcode = PPC::LFD; @@ -3143,7 +3152,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, break; case PPC::STXSSPX: if (PostRA) { - if (isVFRegister(MI.getOperand(0).getReg())) + if (IsVFReg) III.ImmOpcode = PPC::STXSSP; else { III.ImmOpcode = PPC::STFS; @@ -3157,7 +3166,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, break; case PPC::STXSDX: if (PostRA) { - if (isVFRegister(MI.getOperand(0).getReg())) + if (IsVFReg) III.ImmOpcode = PPC::STXSD; else { III.ImmOpcode = PPC::STFD; @@ -3287,7 +3296,7 @@ bool PPCInstrInfo::isRegElgibleForForwarding( if (MRI.isSSA()) return false; - unsigned Reg = RegMO.getReg(); + Register Reg = RegMO.getReg(); // Walking the inst in reverse(MI-->DefMI) to get the last DEF of the Reg. MachineBasicBlock::const_reverse_iterator It = MI; @@ -3511,8 +3520,8 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI, if (PostRA && III.ZeroIsSpecialOrig != III.ZeroIsSpecialNew) { unsigned PosForOrigZero = III.ZeroIsSpecialOrig ? III.ZeroIsSpecialOrig : III.ZeroIsSpecialNew + 1; - unsigned OrigZeroReg = MI.getOperand(PosForOrigZero).getReg(); - unsigned NewZeroReg = MI.getOperand(III.ZeroIsSpecialNew).getReg(); + Register OrigZeroReg = MI.getOperand(PosForOrigZero).getReg(); + Register NewZeroReg = MI.getOperand(III.ZeroIsSpecialNew).getReg(); // If R0 is in the operand where zero is special for the new instruction, // it is unsafe to transform if the constant operand isn't that operand. if ((NewZeroReg == PPC::R0 || NewZeroReg == PPC::X0) && @@ -3563,16 +3572,20 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI, } else { // The 32 bit and 64 bit instructions are quite different. if (SpecialShift32) { - // Left shifts use (N, 0, 31-N), right shifts use (32-N, N, 31). - uint64_t SH = RightShift ? 32 - ShAmt : ShAmt; + // Left shifts use (N, 0, 31-N). + // Right shifts use (32-N, N, 31) if 0 < N < 32. + // use (0, 0, 31) if N == 0. 
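The three comment lines above fully determine the rlwinm operands, and the N == 0 case is exactly what this hunk fixes: "32 - 0" does not fit the 5-bit SH field, so a zero-amount right shift must encode as (0, 0, 31). A standalone sketch of the computation, in plain C++ rather than patch code, with an invented struct and driver:

#include <cassert>
#include <cstdio>

struct RLWINMOps { unsigned SH, MB, ME; };

// Compute (SH, MB, ME) for a 32-bit shift-by-immediate expressed as a
// rotate-and-mask, following the rules in the comments above.
static RLWINMOps shiftToRLWINM(unsigned ShAmt, bool RightShift) {
  assert(ShAmt < 32 && "shift amount must be 0..31");
  RLWINMOps Ops;
  Ops.SH = ShAmt == 0 ? 0 : RightShift ? 32 - ShAmt : ShAmt;
  Ops.MB = RightShift ? ShAmt : 0;
  Ops.ME = RightShift ? 31 : 31 - ShAmt;
  return Ops;
}

int main() {
  RLWINMOps L = shiftToRLWINM(4, false); // slwi 4 -> (4, 0, 27)
  RLWINMOps R = shiftToRLWINM(4, true);  // srwi 4 -> (28, 4, 31)
  RLWINMOps Z = shiftToRLWINM(0, true);  // srwi 0 -> (0, 0, 31), not (32, ...)
  std::printf("(%u,%u,%u) (%u,%u,%u) (%u,%u,%u)\n", L.SH, L.MB, L.ME, R.SH,
              R.MB, R.ME, Z.SH, Z.MB, Z.ME);
}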
+ uint64_t SH = ShAmt == 0 ? 0 : RightShift ? 32 - ShAmt : ShAmt; uint64_t MB = RightShift ? ShAmt : 0; uint64_t ME = RightShift ? 31 : 31 - ShAmt; replaceInstrOperandWithImm(MI, III.OpNoForForwarding, SH); MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(MB) .addImm(ME); } else { - // Left shifts use (N, 63-N), right shifts use (64-N, N). - uint64_t SH = RightShift ? 64 - ShAmt : ShAmt; + // Left shifts use (N, 63-N). + // Right shifts use (64-N, N) if 0 < N < 64. + // use (0, 0) if N == 0. + uint64_t SH = ShAmt == 0 ? 0 : RightShift ? 64 - ShAmt : ShAmt; uint64_t ME = RightShift ? ShAmt : 63 - ShAmt; replaceInstrOperandWithImm(MI, III.OpNoForForwarding, SH); MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(ME); @@ -3601,8 +3614,8 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI, if (III.ZeroIsSpecialNew) { // If operand at III.ZeroIsSpecialNew is physical reg(eg: ZERO/ZERO8), no // need to fix up register class. - unsigned RegToModify = MI.getOperand(III.ZeroIsSpecialNew).getReg(); - if (TargetRegisterInfo::isVirtualRegister(RegToModify)) { + Register RegToModify = MI.getOperand(III.ZeroIsSpecialNew).getReg(); + if (Register::isVirtualRegister(RegToModify)) { const TargetRegisterClass *NewRC = MRI.getRegClass(RegToModify)->hasSuperClassEq(&PPC::GPRCRegClass) ? &PPC::GPRC_and_GPRC_NOR0RegClass : &PPC::G8RC_and_G8RC_NOX0RegClass; @@ -3747,7 +3760,7 @@ bool PPCInstrInfo::isTOCSaveMI(const MachineInstr &MI) const { return false; unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); unsigned StackOffset = MI.getOperand(1).getImm(); - unsigned StackReg = MI.getOperand(2).getReg(); + Register StackReg = MI.getOperand(2).getReg(); if (StackReg == PPC::X1 && StackOffset == TOCSaveOffset) return true; @@ -3772,7 +3785,7 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, switch (MI.getOpcode()) { case PPC::COPY: { - unsigned SrcReg = MI.getOperand(1).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); // In both ELFv1 and v2 ABI, method parameters and the return value // are sign- or zero-extended. @@ -3781,7 +3794,7 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, // We check the ZExt/SExt flags for a method parameter. if (MI.getParent()->getBasicBlock() == &MF->getFunction().getEntryBlock()) { - unsigned VReg = MI.getOperand(0).getReg(); + Register VReg = MI.getOperand(0).getReg(); if (MF->getRegInfo().isLiveIn(VReg)) return SignExt ? FuncInfo->isLiveInSExt(VReg) : FuncInfo->isLiveInZExt(VReg); @@ -3818,7 +3831,7 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, } // If this is a copy from another register, we recursively check source. - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + if (!Register::isVirtualRegister(SrcReg)) return false; const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); if (SrcMI != NULL) @@ -3841,8 +3854,8 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, case PPC::XORIS8: { // logical operation with 16-bit immediate does not change the upper bits. // So, we track the operand register as we do for register copy. 
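The ORI/ORIS/XORI/XORIS cases here rely on a simple bit-level fact, easy to confirm standalone (values are illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  // ori/xori supply a zero-extended 16-bit immediate; oris/xoris shift it
  // left by 16. Either way only the low 32 bits can change, so whether the
  // source was sign- or zero-extended from 32 bits is preserved.
  uint64_t ZExt = 0x0000000012345678; // zero-extended 32-bit value
  uint64_t SExt = 0xFFFFFFFF87654321; // sign-extended 32-bit value
  uint64_t Imm = 0xABCD;              // 16-bit immediate
  assert(((ZExt | Imm) >> 32) == 0);                  // still zero-extended
  assert(((SExt ^ (Imm << 16)) >> 32) == 0xFFFFFFFF); // still sign-extended
}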
- unsigned SrcReg = MI.getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + Register SrcReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(SrcReg)) return false; const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); if (SrcMI != NULL) @@ -3870,8 +3883,8 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, for (unsigned I = 1; I != E; I += D) { if (MI.getOperand(I).isReg()) { - unsigned SrcReg = MI.getOperand(I).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + Register SrcReg = MI.getOperand(I).getReg(); + if (!Register::isVirtualRegister(SrcReg)) return false; const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); if (SrcMI == NULL || !isSignOrZeroExtended(*SrcMI, SignExt, Depth+1)) @@ -3893,12 +3906,12 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, assert(MI.getOperand(1).isReg() && MI.getOperand(2).isReg()); - unsigned SrcReg1 = MI.getOperand(1).getReg(); - unsigned SrcReg2 = MI.getOperand(2).getReg(); + Register SrcReg1 = MI.getOperand(1).getReg(); + Register SrcReg2 = MI.getOperand(2).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg1) || - !TargetRegisterInfo::isVirtualRegister(SrcReg2)) - return false; + if (!Register::isVirtualRegister(SrcReg1) || + !Register::isVirtualRegister(SrcReg2)) + return false; const MachineInstr *MISrc1 = MRI->getVRegDef(SrcReg1); const MachineInstr *MISrc2 = MRI->getVRegDef(SrcReg2); @@ -3923,21 +3936,99 @@ bool PPCInstrInfo::isBDNZ(unsigned Opcode) const { return (Opcode == (Subtarget.isPPC64() ? PPC::BDNZ8 : PPC::BDNZ)); } -bool PPCInstrInfo::analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst, - MachineInstr *&CmpInst) const { - MachineBasicBlock *LoopEnd = L.getBottomBlock(); - MachineBasicBlock::iterator I = LoopEnd->getFirstTerminator(); - // We really "analyze" only CTR loops right now. - if (I != LoopEnd->end() && isBDNZ(I->getOpcode())) { - IndVarInst = nullptr; - CmpInst = &*I; - return false; +namespace { +class PPCPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo { + MachineInstr *Loop, *EndLoop, *LoopCount; + MachineFunction *MF; + const TargetInstrInfo *TII; + int64_t TripCount; + +public: + PPCPipelinerLoopInfo(MachineInstr *Loop, MachineInstr *EndLoop, + MachineInstr *LoopCount) + : Loop(Loop), EndLoop(EndLoop), LoopCount(LoopCount), + MF(Loop->getParent()->getParent()), + TII(MF->getSubtarget().getInstrInfo()) { + // Inspect the Loop instruction up-front, as it may be deleted when we call + // createTripCountGreaterCondition. + if (LoopCount->getOpcode() == PPC::LI8 || LoopCount->getOpcode() == PPC::LI) + TripCount = LoopCount->getOperand(1).getImm(); + else + TripCount = -1; } - return true; + + bool shouldIgnoreForPipelining(const MachineInstr *MI) const override { + // Only ignore the terminator. + return MI == EndLoop; + } + + Optional<bool> + createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB, + SmallVectorImpl<MachineOperand> &Cond) override { + if (TripCount == -1) { + // Since the BDZ/BDZ8 that we will insert will also decrease the ctr by 1, + // we don't need to generate anything here. + Cond.push_back(MachineOperand::CreateImm(0)); + Cond.push_back(MachineOperand::CreateReg( + MF->getSubtarget<PPCSubtarget>().isPPC64() ? PPC::CTR8 : PPC::CTR, + true)); + return {}; + } + + return TripCount > TC; + } + + void setPreheader(MachineBasicBlock *NewPreheader) override { + // Do nothing.
We want the LOOP setup instruction to stay in the *old* + // preheader, so we can use BDZ in the prologs to adapt the loop trip count. + } + + void adjustTripCount(int TripCountAdjust) override { + // If the loop trip count is a compile-time value, then just change the + // value. + if (LoopCount->getOpcode() == PPC::LI8 || + LoopCount->getOpcode() == PPC::LI) { + int64_t TripCount = LoopCount->getOperand(1).getImm() + TripCountAdjust; + LoopCount->getOperand(1).setImm(TripCount); + return; + } + + // Since the BDZ/BDZ8 that we will insert will also decrease the ctr by 1, + // we don't need to generate anything here. + } + + void disposed() override { + Loop->eraseFromParent(); + // Ensure the loop setup instruction is deleted too. + LoopCount->eraseFromParent(); + } +}; +} // namespace + +std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> +PPCInstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const { + // We really "analyze" only hardware loops right now. + MachineBasicBlock::iterator I = LoopBB->getFirstTerminator(); + MachineBasicBlock *Preheader = *LoopBB->pred_begin(); + if (Preheader == LoopBB) + Preheader = *std::next(LoopBB->pred_begin()); + MachineFunction *MF = Preheader->getParent(); + + if (I != LoopBB->end() && isBDNZ(I->getOpcode())) { + SmallPtrSet<MachineBasicBlock *, 8> Visited; + if (MachineInstr *LoopInst = findLoopInstr(*Preheader, Visited)) { + Register LoopCountReg = LoopInst->getOperand(0).getReg(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineInstr *LoopCount = MRI.getUniqueVRegDef(LoopCountReg); + return std::make_unique<PPCPipelinerLoopInfo>(LoopInst, &*I, LoopCount); + } + } + return nullptr; } -MachineInstr * -PPCInstrInfo::findLoopInstr(MachineBasicBlock &PreHeader) const { +MachineInstr *PPCInstrInfo::findLoopInstr( + MachineBasicBlock &PreHeader, + SmallPtrSet<MachineBasicBlock *, 8> &Visited) const { unsigned LOOPi = (Subtarget.isPPC64() ? PPC::MTCTR8loop : PPC::MTCTRloop); @@ -3948,50 +4039,6 @@ PPCInstrInfo::findLoopInstr(MachineBasicBlock &PreHeader) const { return nullptr; } -unsigned PPCInstrInfo::reduceLoopCount( - MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar, - MachineInstr &Cmp, SmallVectorImpl<MachineOperand> &Cond, - SmallVectorImpl<MachineInstr *> &PrevInsts, unsigned Iter, - unsigned MaxIter) const { - // We expect a hardware loop currently. This means that IndVar is set - // to null, and the compare is the ENDLOOP instruction. - assert((!IndVar) && isBDNZ(Cmp.getOpcode()) && "Expecting a CTR loop"); - MachineFunction *MF = MBB.getParent(); - DebugLoc DL = Cmp.getDebugLoc(); - MachineInstr *Loop = findLoopInstr(PreHeader); - if (!Loop) - return 0; - unsigned LoopCountReg = Loop->getOperand(0).getReg(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - MachineInstr *LoopCount = MRI.getUniqueVRegDef(LoopCountReg); - - if (!LoopCount) - return 0; - // If the loop trip count is a compile-time value, then just change the - // value. - if (LoopCount->getOpcode() == PPC::LI8 || LoopCount->getOpcode() == PPC::LI) { - int64_t Offset = LoopCount->getOperand(1).getImm(); - if (Offset <= 1) { - LoopCount->eraseFromParent(); - Loop->eraseFromParent(); - return 0; - } - LoopCount->getOperand(1).setImm(Offset - 1); - return Offset - 1; - } - - // The loop trip count is a run-time value. - // We need to subtract one from the trip count, - // and insert branch later to check if we're done with the loop.
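For context, createTripCountGreaterCondition above has a three-way contract. A standalone model of it, with std::optional standing in for llvm::Optional (names and values are illustrative):

#include <cassert>
#include <cstdint>
#include <optional>

// TripCount == -1 encodes a run-time trip count, as in the class above. A
// known answer lets the pipeliner resolve the guard statically; nullopt
// means it must branch on the CTR-based condition pushed into Cond.
std::optional<bool> tripCountGreater(int64_t TripCount, int TC) {
  if (TripCount == -1)
    return std::nullopt;
  return TripCount > TC;
}

int main() {
  assert(tripCountGreater(8, 3) == true);        // statically known: run it
  assert(tripCountGreater(2, 3) == false);       // statically known: skip it
  assert(!tripCountGreater(-1, 3).has_value());  // must be tested at run time
}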
- - Since BDZ/BDZ8 that we will insert will also decrease the ctr by 1, - so we don't need to generate any thing here. - Cond.push_back(MachineOperand::CreateImm(0)); - Cond.push_back(MachineOperand::CreateReg( - Subtarget.isPPC64() ? PPC::CTR8 : PPC::CTR, true)); - return LoopCountReg; -} - // Return true if get the base operand, byte offset of an instruction and the // memory width. Width is the size of memory that is being loaded/stored. bool PPCInstrInfo::getMemOperandWithOffsetWidth( @@ -4018,8 +4065,7 @@ bool PPCInstrInfo::getMemOperandWithOffsetWidth( } bool PPCInstrInfo::areMemAccessesTriviallyDisjoint( - const MachineInstr &MIa, const MachineInstr &MIb, - AliasAnalysis * /*AA*/) const { + const MachineInstr &MIa, const MachineInstr &MIb) const { assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index 70fb757e8f1e..19ab30cb0908 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -248,11 +248,11 @@ public: unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; bool isReallyTriviallyReMaterializable(const MachineInstr &MI, - AliasAnalysis *AA) const override; + AAResults *AA) const override; unsigned isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override; - bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, + bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; void insertNoop(MachineBasicBlock &MBB, @@ -370,8 +370,7 @@ public: /// otherwise bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, - const MachineInstr &MIb, - AliasAnalysis *AA = nullptr) const override; + const MachineInstr &MIb) const override; /// GetInstSize - Return the number of bytes of code the specified /// instruction may be. This returns the maximum number of bytes. @@ -439,9 +438,14 @@ public: void replaceInstrOperandWithImm(MachineInstr &MI, unsigned OpNo, int64_t Imm) const; - bool instrHasImmForm(const MachineInstr &MI, ImmInstrInfo &III, + bool instrHasImmForm(unsigned Opc, bool IsVFReg, ImmInstrInfo &III, bool PostRA) const; + // In the post-RA phase, try to find the instruction that defines \p Reg + // before \p MI. \p SeenIntermediateUse is set to true if uses exist between + // DefMI and \p MI. + MachineInstr *getDefMIPostRA(unsigned Reg, MachineInstr &MI, + bool &SeenIntermediateUse) const; + /// getRegNumForOperand - some operands use different numbering schemes /// for the same registers. For example, a VSX instruction may have any of /// vs0-vs63 allocated whereas an Altivec instruction could only have @@ -481,26 +485,14 @@ public: /// On PPC, we have two instructions used to set-up the hardware loop /// (MTCTRloop, MTCTR8loop) with corresponding endloop (BDNZ, BDNZ8) /// instructions to indicate the end of a loop. - MachineInstr *findLoopInstr(MachineBasicBlock &PreHeader) const; + MachineInstr * + findLoopInstr(MachineBasicBlock &PreHeader, + SmallPtrSet<MachineBasicBlock *, 8> &Visited) const; - /// Analyze the loop code to find the loop induction variable and compare used - /// to compute the number of iterations. Currently, we analyze loop that are - /// controlled using hardware loops. In this case, the induction variable - /// instruction is null. For all other cases, this function returns true, - /// which means we're unable to analyze it.
\p IndVarInst and \p CmpInst will - /// return new values when we can analyze the readonly loop \p L, otherwise, - /// nothing got changed - bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst, - MachineInstr *&CmpInst) const override; - /// Generate code to reduce the loop iteration by one and check if the loop - /// is finished. Return the value/register of the new loop count. We need - /// this function when peeling off one or more iterations of a loop. This - /// function assumes the last iteration is peeled first. - unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, - MachineInstr *IndVar, MachineInstr &Cmp, - SmallVectorImpl<MachineOperand> &Cond, - SmallVectorImpl<MachineInstr *> &PrevInsts, - unsigned Iter, unsigned MaxIter) const override; + /// Analyze loop L, which must be a single-basic-block loop, and if the + /// conditions can be understood enough produce a PipelinerLoopInfo object. + std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> + analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override; }; } diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index c313337047f0..24183277519b 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -386,7 +386,9 @@ def immZExt16 : PatLeaf<(imm), [{ // field. Used by instructions like 'ori'. return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue(); }], LO16>; -def immAnyExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm) || isUInt<8>(Imm); }]>; +def immNonAllOneAnyExt8 : ImmLeaf<i32, [{ + return (isInt<8>(Imm) && (Imm != -1)) || (isUInt<8>(Imm) && (Imm != 0xFF)); +}]>; def immSExt5NonZero : ImmLeaf<i32, [{ return Imm && isInt<5>(Imm); }]>; // imm16Shifted* - These match immediates where the low 16-bits are zero. There @@ -577,7 +579,7 @@ def sperc : RegisterOperand<SPERC> { def PPCRegSPE4RCAsmOperand : AsmOperandClass { let Name = "RegSPE4RC"; let PredicateMethod = "isRegNumber"; } -def spe4rc : RegisterOperand<SPE4RC> { +def spe4rc : RegisterOperand<GPRC> { let ParserMatchClass = PPCRegSPE4RCAsmOperand; } @@ -3161,7 +3163,16 @@ def ADDISdtprelHA32 : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s1 def LWZtoc : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg), "#LWZtoc", [(set i32:$rD, + (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>; +def LWZtocL : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc_nor0:$reg), + "#LWZtocL", + [(set i32:$rD, (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>; +def ADDIStocHA : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentry32:$disp), + "#ADDIStocHA", + [(set i32:$rD, + (PPCtoc_entry i32:$reg, tglobaladdr:$disp))]>; + // Get Global (GOT) Base Register offset, from the word immediately preceding // the function label. 
def UpdateGBR : PPCEmitTimePseudo<(outs gprc:$rD, gprc:$rT), (ins gprc:$rI), "#UpdateGBR", []>; @@ -3177,21 +3188,21 @@ def : Pat<(srl i32:$rS, i32:$rB), def : Pat<(shl i32:$rS, i32:$rB), (SLW $rS, $rB)>; -def : Pat<(zextloadi1 iaddr:$src), +def : Pat<(i32 (zextloadi1 iaddr:$src)), (LBZ iaddr:$src)>; -def : Pat<(zextloadi1 xaddr:$src), +def : Pat<(i32 (zextloadi1 xaddr:$src)), (LBZX xaddr:$src)>; -def : Pat<(extloadi1 iaddr:$src), +def : Pat<(i32 (extloadi1 iaddr:$src)), (LBZ iaddr:$src)>; -def : Pat<(extloadi1 xaddr:$src), +def : Pat<(i32 (extloadi1 xaddr:$src)), (LBZX xaddr:$src)>; -def : Pat<(extloadi8 iaddr:$src), +def : Pat<(i32 (extloadi8 iaddr:$src)), (LBZ iaddr:$src)>; -def : Pat<(extloadi8 xaddr:$src), +def : Pat<(i32 (extloadi8 xaddr:$src)), (LBZX xaddr:$src)>; -def : Pat<(extloadi16 iaddr:$src), +def : Pat<(i32 (extloadi16 iaddr:$src)), (LHZ iaddr:$src)>; -def : Pat<(extloadi16 xaddr:$src), +def : Pat<(i32 (extloadi16 xaddr:$src)), (LHZX xaddr:$src)>; let Predicates = [HasFPU] in { def : Pat<(f64 (extloadf32 iaddr:$src)), @@ -3564,23 +3575,6 @@ def : Pat<(i1 (setcc i32:$s1, imm:$imm, SETEQ)), (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)), (LO16 imm:$imm)), sub_eq)>; -defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGE)), - (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGE)), - (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULE)), - (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLE)), - (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETNE)), - (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>; -defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETNE)), - (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>; - -defm : CRNotPat<(i1 (setcc i32:$s1, imm:$imm, SETNE)), - (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)), - (LO16 imm:$imm)), sub_eq)>; - def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETULT)), (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>; def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETLT)), @@ -3592,17 +3586,6 @@ def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETGT)), def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETEQ)), (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETUGE)), - (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETGE)), - (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETULE)), - (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETLE)), - (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETNE)), - (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>; - // SETCC for i64. 
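The CRNotPat definitions being moved here all encode conditions that are complements of a directly computable CR bit. The identity they rely on is easy to sanity-check standalone (illustrative values):

#include <cassert>
#include <cstdint>

int main() {
  // One cmplw sets the "lt" CR bit for unsigned <; unsigned >= is its exact
  // complement, so the SETUGE pattern extracts sub_lt and inverts it rather
  // than issuing a second compare. Likewise for ule/ugt and ne/eq.
  uint32_t Vals[] = {0u, 1u, 7u, 0x8000u, 0xFFFFFFFFu};
  for (uint32_t A : Vals)
    for (uint32_t B : Vals) {
      assert((A >= B) == !(A < B));
      assert((A <= B) == !(A > B));
      assert((A != B) == !(A == B));
    }
}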
def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULT)), (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>; @@ -3632,6 +3615,47 @@ def : Pat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETEQ)), (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)), (LO16 imm:$imm)), sub_eq)>; +def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETULT)), + (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>; +def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETLT)), + (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>; +def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETUGT)), + (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>; +def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETGT)), + (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>; +def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETEQ)), + (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>; + +// Instantiations of CRNotPat for i32. +defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGE)), + (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGE)), + (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULE)), + (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLE)), + (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETNE)), + (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>; +defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETNE)), + (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>; + +defm : CRNotPat<(i1 (setcc i32:$s1, imm:$imm, SETNE)), + (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)), + (LO16 imm:$imm)), sub_eq)>; + +defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETUGE)), + (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETGE)), + (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETULE)), + (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETLE)), + (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETNE)), + (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>; + +// Instantiations of CRNotPat for i64. defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGE)), (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>; defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGE)), @@ -3649,17 +3673,6 @@ defm : CRNotPat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETNE)), (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)), (LO16 imm:$imm)), sub_eq)>; -def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETULT)), - (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>; -def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETLT)), - (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>; -def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETUGT)), - (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>; -def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETGT)), - (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>; -def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETEQ)), - (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>; - defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETUGE)), (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>; defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETGE)), @@ -3671,6 +3684,56 @@ defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETLE)), defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETNE)), (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>; +let Predicates = [HasFPU] in { +// Instantiations of CRNotPat for f32. 
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETO)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>; + +// Instantiations of CRNotPat for f64. +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>; + +// Instantiations of CRNotPat for f128. +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUGE)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETGE)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETULE)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETLE)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUNE)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETNE)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETO)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>; +} + // SETCC for f32. let Predicates = [HasFPU] in { def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)), @@ -3688,21 +3751,6 @@ def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETEQ)), def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETUO)), (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETO)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>; - // SETCC for f64. 
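For the floating-point instantiations just above, the complement pairs an "unordered" condition with an "ordered" one; a NaN operand is what makes the inversion exact. A quick standalone check (illustrative values):

#include <cassert>
#include <cmath>

int main() {
  // SETUGE (unordered or >=) is exactly the negation of SETOLT (ordered <):
  // a NaN makes UGE true and OLT false, so the inverted sub_lt bit of a
  // single fcmpu covers both.
  double Vals[] = {-1.0, 0.0, 2.5, std::nan("")};
  for (double A : Vals)
    for (double B : Vals) {
      bool UGE = std::isnan(A) || std::isnan(B) || A >= B;
      bool OLT = A < B; // ordered less-than: false whenever a NaN is present
      assert(UGE == !OLT);
    }
}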
def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOLT)), (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; @@ -3719,21 +3767,6 @@ def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETEQ)), def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETUO)), (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>; - // SETCC for f128. def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETOLT)), (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; @@ -3750,21 +3783,6 @@ def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETEQ)), def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETUO)), (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUGE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETGE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETULE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETLE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUNE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETNE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETO)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>; - } // This must be in this file because it relies on patterns defined in this file diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 07f38a61d098..2aad5860d87f 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -58,8 +58,12 @@ def SDT_PPCldvsxlh : SDTypeProfile<1, 1, [ SDTCisVT<0, v4f32>, SDTCisPtrTy<1> ]>; -def SDT_PPCfpextlh : SDTypeProfile<1, 1, [ - SDTCisVT<0, v2f64>, SDTCisVT<1, v4f32> +def SDT_PPCfpexth : SDTypeProfile<1, 2, [ + SDTCisVT<0, v2f64>, SDTCisVT<1, v4f32>, SDTCisPtrTy<2> +]>; + +def SDT_PPCldsplat : SDTypeProfile<1, 1, [ + SDTCisVec<0>, SDTCisPtrTy<1> ]>; // Little-endian-specific nodes. 
@@ -78,12 +82,21 @@ def SDTVecConv : SDTypeProfile<1, 2, [ def SDTVabsd : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<3, i32> ]>; - +def SDT_PPCld_vec_be : SDTypeProfile<1, 1, [ + SDTCisVec<0>, SDTCisPtrTy<1> +]>; +def SDT_PPCst_vec_be : SDTypeProfile<0, 2, [ + SDTCisVec<0>, SDTCisPtrTy<1> +]>; def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x, [SDNPHasChain, SDNPMayStore]>; +def PPCld_vec_be : SDNode<"PPCISD::LOAD_VEC_BE", SDT_PPCld_vec_be, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def PPCst_vec_be : SDNode<"PPCISD::STORE_VEC_BE", SDT_PPCst_vec_be, + [SDNPHasChain, SDNPMayStore]>; def PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>; def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>; def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>; @@ -93,9 +106,11 @@ def PPCuvec2fp: SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>; def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>; def PPCvabsd : SDNode<"PPCISD::VABSD", SDTVabsd, []>; -def PPCfpextlh : SDNode<"PPCISD::FP_EXTEND_LH", SDT_PPCfpextlh, []>; +def PPCfpexth : SDNode<"PPCISD::FP_EXTEND_HALF", SDT_PPCfpexth, []>; def PPCldvsxlh : SDNode<"PPCISD::LD_VSX_LH", SDT_PPCldvsxlh, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def PPCldsplat : SDNode<"PPCISD::LD_SPLAT", SDT_PPCldsplat, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase, string asmstr, InstrItinClass itin, Intrinsic Int, @@ -855,14 +870,14 @@ let Uses = [RM] in { let isCodeGenOnly = 1, isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in { - def XXLXORz : XX3Form_Zero<60, 154, (outs vsrc:$XT), (ins), + def XXLXORz : XX3Form_SameOp<60, 154, (outs vsrc:$XT), (ins), "xxlxor $XT, $XT, $XT", IIC_VecGeneral, [(set v4i32:$XT, (v4i32 immAllZerosV))]>; - def XXLXORdpz : XX3Form_SetZero<60, 154, + def XXLXORdpz : XX3Form_SameOp<60, 154, (outs vsfrc:$XT), (ins), "xxlxor $XT, $XT, $XT", IIC_VecGeneral, [(set f64:$XT, (fpimm0))]>; - def XXLXORspz : XX3Form_SetZero<60, 154, + def XXLXORspz : XX3Form_SameOp<60, 154, (outs vssrc:$XT), (ins), "xxlxor $XT, $XT, $XT", IIC_VecGeneral, [(set f32:$XT, (fpimm0))]>; @@ -996,21 +1011,21 @@ def : Pat<(f64 (extractelt v2f64:$S, 1)), (f64 (EXTRACT_SUBREG $S, sub_64))>; } -// Additional fnmsub patterns: -a*c + b == -(a*c - b) -def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B), - (XSNMSUBADP $B, $C, $A)>; -def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B), - (XSNMSUBADP $B, $C, $A)>; +// Additional fnmsub patterns: -a*b + c == -(a*b - c) +def : Pat<(fma (fneg f64:$A), f64:$B, f64:$C), + (XSNMSUBADP $C, $A, $B)>; +def : Pat<(fma f64:$A, (fneg f64:$B), f64:$C), + (XSNMSUBADP $C, $A, $B)>; -def : Pat<(fma (fneg v2f64:$A), v2f64:$C, v2f64:$B), - (XVNMSUBADP $B, $C, $A)>; -def : Pat<(fma v2f64:$A, (fneg v2f64:$C), v2f64:$B), - (XVNMSUBADP $B, $C, $A)>; +def : Pat<(fma (fneg v2f64:$A), v2f64:$B, v2f64:$C), + (XVNMSUBADP $C, $A, $B)>; +def : Pat<(fma v2f64:$A, (fneg v2f64:$B), v2f64:$C), + (XVNMSUBADP $C, $A, $B)>; -def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B), - (XVNMSUBASP $B, $C, $A)>; -def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B), - (XVNMSUBASP $B, $C, $A)>; +def : Pat<(fma (fneg v4f32:$A), v4f32:$B, v4f32:$C), + (XVNMSUBASP $C, $A, $B)>; +def : Pat<(fma v4f32:$A, (fneg v4f32:$B), v4f32:$C), + (XVNMSUBASP $C, $A, $B)>; def : Pat<(v2f64 (bitconvert v4f32:$A)), (COPY_TO_REGCLASS $A, 
VSRC)>; @@ -1077,7 +1092,8 @@ def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 0)), def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 1)), (v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>; -def : Pat<(v2f64 (PPCfpextlh v4f32:$C)), (XVCVSPDP (XXMRGHW $C, $C))>; +def : Pat<(v2f64 (PPCfpexth v4f32:$C, 0)), (XVCVSPDP (XXMRGHW $C, $C))>; +def : Pat<(v2f64 (PPCfpexth v4f32:$C, 1)), (XVCVSPDP (XXMRGLW $C, $C))>; // Loads. let Predicates = [HasVSX, HasOnlySwappingMemOps] in { @@ -1088,6 +1104,19 @@ let Predicates = [HasVSX, HasOnlySwappingMemOps] in { (STXVD2X $rS, xoaddr:$dst)>; def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; } + +// Load vector big endian order +let Predicates = [IsLittleEndian, HasVSX] in { + def : Pat<(v2f64 (PPCld_vec_be xoaddr:$src)), (LXVD2X xoaddr:$src)>; + def : Pat<(PPCst_vec_be v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; + def : Pat<(v4f32 (PPCld_vec_be xoaddr:$src)), (LXVW4X xoaddr:$src)>; + def : Pat<(PPCst_vec_be v4f32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>; + def : Pat<(v2i64 (PPCld_vec_be xoaddr:$src)), (LXVD2X xoaddr:$src)>; + def : Pat<(PPCst_vec_be v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; + def : Pat<(v4i32 (PPCld_vec_be xoaddr:$src)), (LXVW4X xoaddr:$src)>; + def : Pat<(PPCst_vec_be v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>; +} + let Predicates = [IsBigEndian, HasVSX, HasOnlySwappingMemOps] in { def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>; def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>; @@ -1288,6 +1317,13 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. def : Pat<(int_ppc_vsx_xxleqv v4i32:$A, v4i32:$B), (XXLEQV $A, $B)>; + let isCodeGenOnly = 1, isMoveImm = 1, isAsCheapAsAMove = 1, + isReMaterializable = 1 in { + def XXLEQVOnes : XX3Form_SameOp<60, 186, (outs vsrc:$XT), (ins), + "xxleqv $XT, $XT, $XT", IIC_VecGeneral, + [(set v4i32:$XT, (bitconvert (v16i8 immAllOnesV)))]>; + } + def XXLORC : XX3Form<60, 170, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xxlorc $XT, $XA, $XB", IIC_VecGeneral, @@ -1476,6 +1512,12 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. 
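XXLEQVOnes above materializes the all-ones vector with an xxleqv of a register against itself. The two bit-level facts behind it, and behind the related immNonAllOneAnyExt8 change in PPCInstrInfo.td, can be checked standalone (illustrative values):

#include <cassert>
#include <cstdint>

int main() {
  // eqv computes ~(a ^ b); with equal operands the result is all ones
  // regardless of what the register held, so no constant load is needed.
  uint32_t X = 0xDEADBEEF;
  assert(~(X ^ X) == 0xFFFFFFFFu);
  // 0xFF and (int8_t)-1 are the same byte, which is why the splat-immediate
  // predicate now rejects both spellings of all-ones and leaves that case
  // to xxleqv.
  assert(static_cast<uint8_t>(static_cast<int8_t>(-1)) == 0xFF);
}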
AltVSXFMARel; } + // Additional xsnmsubasp patterns: -a*b + c == -(a*b - c) + def : Pat<(fma (fneg f32:$A), f32:$B, f32:$C), + (XSNMSUBASP $C, $A, $B)>; + def : Pat<(fma f32:$A, (fneg f32:$B), f32:$C), + (XSNMSUBASP $C, $A, $B)>; + // Single Precision Conversions (FP <-> INT) def XSCVSXDSP : XX2Form<60, 312, (outs vssrc:$XT), (ins vsfrc:$XB), @@ -1564,16 +1606,33 @@ let Predicates = [HasDirectMove] in { def MFVSRWZ : XX1_RS6_RD5_XO<31, 115, (outs gprc:$rA), (ins vsfrc:$XT), "mfvsrwz $rA, $XT", IIC_VecGeneral, [(set i32:$rA, (PPCmfvsr f64:$XT))]>; + let isCodeGenOnly = 1 in + def MFVRWZ : XX1_RS6_RD5_XO<31, 115, (outs gprc:$rA), (ins vsrc:$XT), + "mfvsrwz $rA, $XT", IIC_VecGeneral, + []>; def MTVSRD : XX1_RS6_RD5_XO<31, 179, (outs vsfrc:$XT), (ins g8rc:$rA), "mtvsrd $XT, $rA", IIC_VecGeneral, [(set f64:$XT, (PPCmtvsra i64:$rA))]>, Requires<[In64BitMode]>; + let isCodeGenOnly = 1 in + def MTVRD : XX1_RS6_RD5_XO<31, 179, (outs vsrc:$XT), (ins g8rc:$rA), + "mtvsrd $XT, $rA", IIC_VecGeneral, + []>, + Requires<[In64BitMode]>; def MTVSRWA : XX1_RS6_RD5_XO<31, 211, (outs vsfrc:$XT), (ins gprc:$rA), "mtvsrwa $XT, $rA", IIC_VecGeneral, [(set f64:$XT, (PPCmtvsra i32:$rA))]>; + let isCodeGenOnly = 1 in + def MTVRWA : XX1_RS6_RD5_XO<31, 211, (outs vsrc:$XT), (ins gprc:$rA), + "mtvsrwa $XT, $rA", IIC_VecGeneral, + []>; def MTVSRWZ : XX1_RS6_RD5_XO<31, 243, (outs vsfrc:$XT), (ins gprc:$rA), "mtvsrwz $XT, $rA", IIC_VecGeneral, [(set f64:$XT, (PPCmtvsrz i32:$rA))]>; + let isCodeGenOnly = 1 in + def MTVRWZ : XX1_RS6_RD5_XO<31, 243, (outs vsrc:$XT), (ins gprc:$rA), + "mtvsrwz $XT, $rA", IIC_VecGeneral, + []>; } // HasDirectMove let Predicates = [IsISA3_0, HasDirectMove] in { @@ -1597,6 +1656,22 @@ def : InstAlias<"mfvrd $rA, $XT", (MFVRD g8rc:$rA, vrrc:$XT), 0>; def : InstAlias<"mffprd $rA, $src", (MFVSRD g8rc:$rA, f8rc:$src)>; +def : InstAlias<"mtvrd $XT, $rA", + (MTVRD vrrc:$XT, g8rc:$rA), 0>; +def : InstAlias<"mtfprd $dst, $rA", + (MTVSRD f8rc:$dst, g8rc:$rA)>; +def : InstAlias<"mfvrwz $rA, $XT", + (MFVRWZ gprc:$rA, vrrc:$XT), 0>; +def : InstAlias<"mffprwz $rA, $src", + (MFVSRWZ gprc:$rA, f8rc:$src)>; +def : InstAlias<"mtvrwa $XT, $rA", + (MTVRWA vrrc:$XT, gprc:$rA), 0>; +def : InstAlias<"mtfprwa $dst, $rA", + (MTVSRWA f8rc:$dst, gprc:$rA)>; +def : InstAlias<"mtvrwz $XT, $rA", + (MTVRWZ vrrc:$XT, gprc:$rA), 0>; +def : InstAlias<"mtfprwz $dst, $rA", + (MTVSRWZ f8rc:$dst, gprc:$rA)>; /* Direct moves of various widths from GPR's into VSR's. Each move lines the value up into element 0 (both BE and LE). 
Namely, entities smaller than @@ -2581,9 +2656,9 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (fneg (int_ppc_fmaf128_round_to_odd f128:$vA, f128:$vB, (fneg f128:$vTi))))]>; - // Additional fnmsub patterns: -a*c + b == -(a*c - b) - def : Pat<(fma (fneg f128:$A), f128:$C, f128:$B), (XSNMSUBQP $B, $C, $A)>; - def : Pat<(fma f128:$A, (fneg f128:$C), f128:$B), (XSNMSUBQP $B, $C, $A)>; + // Additional fnmsub patterns: -a*b + c == -(a*b - c) + def : Pat<(fma (fneg f128:$A), f128:$B, f128:$C), (XSNMSUBQP $C, $A, $B)>; + def : Pat<(fma f128:$A, (fneg f128:$B), f128:$C), (XSNMSUBQP $C, $A, $B)>; //===--------------------------------------------------------------------===// // Quad/Double-Precision Compare Instructions: @@ -2799,12 +2874,12 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB), "xvtstdcsp $XT, $XB, $DCMX", IIC_VecFP, [(set v4i32: $XT, - (int_ppc_vsx_xvtstdcsp v4f32:$XB, imm:$DCMX))]>; + (int_ppc_vsx_xvtstdcsp v4f32:$XB, timm:$DCMX))]>; def XVTSTDCDP : XX2_RD6_DCMX7_RS6<60, 15, 5, (outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB), "xvtstdcdp $XT, $XB, $DCMX", IIC_VecFP, [(set v2i64: $XT, - (int_ppc_vsx_xvtstdcdp v2f64:$XB, imm:$DCMX))]>; + (int_ppc_vsx_xvtstdcdp v2f64:$XB, timm:$DCMX))]>; //===--------------------------------------------------------------------===// @@ -3024,6 +3099,16 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 4))>; def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)), (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>; + + def : Pat<(v8i16 (PPCld_vec_be xoaddr:$src)), + (COPY_TO_REGCLASS (LXVH8X xoaddr:$src), VRRC)>; + def : Pat<(PPCst_vec_be v8i16:$rS, xoaddr:$dst), + (STXVH8X (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>; + + def : Pat<(v16i8 (PPCld_vec_be xoaddr:$src)), + (COPY_TO_REGCLASS (LXVB16X xoaddr:$src), VRRC)>; + def : Pat<(PPCst_vec_be v16i8:$rS, xoaddr:$dst), + (STXVB16X (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>; } // IsLittleEndian, HasP9Vector let Predicates = [IsBigEndian, HasP9Vector] in { @@ -3059,7 +3144,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 8))>; def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)), (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>; - } // IsLittleEndian, HasP9Vector + } // IsBigEndian, HasP9Vector // D-Form Load/Store def : Pat<(v4i32 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>; @@ -3858,6 +3943,10 @@ let AddedComplexity = 400 in { (XSCVDPUXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1))>; def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)), (v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>; + def : Pat<(v2f64 (PPCldsplat xoaddr:$A)), + (v2f64 (LXVDSX xoaddr:$A))>; + def : Pat<(v2i64 (PPCldsplat xoaddr:$A)), + (v2i64 (LXVDSX xoaddr:$A))>; // Build vectors of floating point converted to i64. 
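The fnmsub pattern updates in this file (including the XSNMSUBQP hunk above) rename the operands so the algebra in the comment matches the code. The identity itself is easy to verify with std::fma (values are illustrative; sign-of-NaN corner cases aside):

#include <cassert>
#include <cmath>

int main() {
  // A negated multiply-subtract computes -(a*b - c), which is the same value
  // as (-a)*b + c and as a*(-b) + c, so both fneg placements fold into it.
  double A = 3.0, B = 5.0, C = 7.0;
  assert(-(std::fma(A, B, -C)) == std::fma(-A, B, C));
  assert(-(std::fma(A, B, -C)) == std::fma(A, -B, C));
}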
def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)), @@ -4063,27 +4152,32 @@ let AddedComplexity = 400 in { (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>; } + let Predicates = [HasP8Vector] in { + def : Pat<(v1i128 (bitconvert (v16i8 immAllOnesV))), + (v1i128 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>; + def : Pat<(v2i64 (bitconvert (v16i8 immAllOnesV))), + (v2i64 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>; + def : Pat<(v8i16 (bitconvert (v16i8 immAllOnesV))), + (v8i16 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>; + def : Pat<(v16i8 (bitconvert (v16i8 immAllOnesV))), + (v16i8 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>; + } + let Predicates = [HasP9Vector] in { // Endianness-neutral patterns for const splats with ISA 3.0 instructions. def : Pat<(v4i32 (scalar_to_vector i32:$A)), (v4i32 (MTVSRWS $A))>; def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)), (v4i32 (MTVSRWS $A))>; - def : Pat<(v16i8 (build_vector immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, - immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, - immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, - immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, - immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, - immAnyExt8:$A)), + def : Pat<(v16i8 (build_vector immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A, + immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A, + immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A, + immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A, + immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A, + immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A, + immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A, + immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A)), (v16i8 (COPY_TO_REGCLASS (XXSPLTIB imm:$A), VSRC))>; - def : Pat<(v16i8 immAllOnesV), - (v16i8 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>; - def : Pat<(v8i16 immAllOnesV), - (v8i16 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>; - def : Pat<(v4i32 immAllOnesV), - (v4i32 (XXSPLTIB 255))>; - def : Pat<(v2i64 immAllOnesV), - (v2i64 (XXSPLTIB 255))>; def : Pat<(v4i32 (scalar_to_vector FltToIntLoad.A)), (v4i32 (XVCVSPSXWS (LXVWSX xoaddr:$A)))>; def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)), @@ -4102,6 +4196,10 @@ let AddedComplexity = 400 in { (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS (DFLOADf32 iaddrX4:$A), VSFRC)), 0))>; + def : Pat<(v4f32 (PPCldsplat xoaddr:$A)), + (v4f32 (LXVWSX xoaddr:$A))>; + def : Pat<(v4i32 (PPCldsplat xoaddr:$A)), + (v4i32 (LXVWSX xoaddr:$A))>; } let Predicates = [IsISA3_0, HasDirectMove, IsBigEndian] in { diff --git a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp index 4d45d96d4479..d252cfbd26b1 100644 --- a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp +++ b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp @@ -63,8 +63,24 @@ static cl::opt<unsigned> MaxVars("ppc-preinc-prep-max-vars", cl::desc("Potential PHI threshold for PPC preinc loop prep")); STATISTIC(PHINodeAlreadyExists, "PHI node already in pre-increment form"); +STATISTIC(UpdFormChainRewritten, "Num of update form chain rewritten"); namespace { + struct BucketElement { + BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {} + BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {} + + const SCEVConstant *Offset; + Instruction *Instr; + }; + + struct Bucket { + Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B), + Elements(1, BucketElement(I)) {} + + const SCEV *BaseSCEV; + SmallVector<BucketElement, 16> Elements; + }; class PPCLoopPreIncPrep : public FunctionPass { public: @@ -85,21 +101,47 @@ namespace { AU.addRequired<ScalarEvolutionWrapperPass>(); } - bool 
alreadyPrepared(Loop *L, Instruction* MemI, - const SCEV *BasePtrStartSCEV, - const SCEVConstant *BasePtrIncSCEV); bool runOnFunction(Function &F) override; - bool runOnLoop(Loop *L); - void simplifyLoopLatch(Loop *L); - bool rotateLoop(Loop *L); - private: PPCTargetMachine *TM = nullptr; + const PPCSubtarget *ST; DominatorTree *DT; LoopInfo *LI; ScalarEvolution *SE; bool PreserveLCSSA; + + bool runOnLoop(Loop *L); + + /// Check if the required PHI node already exists in Loop \p L. + bool alreadyPrepared(Loop *L, Instruction* MemI, + const SCEV *BasePtrStartSCEV, + const SCEVConstant *BasePtrIncSCEV); + + /// Collect condition-matched (\p isValidCandidate() returns true) + /// candidates in Loop \p L. + SmallVector<Bucket, 16> + collectCandidates(Loop *L, + std::function<bool(const Instruction *, const Value *)> + isValidCandidate, + unsigned MaxCandidateNum); + + /// Add a candidate to candidates \p Buckets. + void addOneCandidate(Instruction *MemI, const SCEV *LSCEV, + SmallVector<Bucket, 16> &Buckets, + unsigned MaxCandidateNum); + + /// Prepare all candidates in \p Buckets for update form. + bool updateFormPrep(Loop *L, SmallVector<Bucket, 16> &Buckets); + + /// Prepare for one chain \p BucketChain, find the best base element and + /// update all other elements in \p BucketChain accordingly. + bool prepareBaseForUpdateFormChain(Bucket &BucketChain); + + /// Rewrite load/store instructions in \p BucketChain according to + /// preparation. + bool rewriteLoadStores(Loop *L, Bucket &BucketChain, + SmallSet<BasicBlock *, 16> &BBChanged); }; } // end anonymous namespace @@ -111,30 +153,15 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false) +static const std::string PHINodeNameSuffix = ".phi"; +static const std::string CastNodeNameSuffix = ".cast"; +static const std::string GEPNodeIncNameSuffix = ".inc"; +static const std::string GEPNodeOffNameSuffix = ".off"; + FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) { return new PPCLoopPreIncPrep(TM); } -namespace { - - struct BucketElement { - BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {} - BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {} - - const SCEVConstant *Offset; - Instruction *Instr; - }; - - struct Bucket { - Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B), - Elements(1, BucketElement(I)) {} - - const SCEV *BaseSCEV; - SmallVector<BucketElement, 16> Elements; - }; - -} // end anonymous namespace - static bool IsPtrInBounds(Value *BasePtr) { Value *StrippedBasePtr = BasePtr; while (BitCastInst *BC = dyn_cast<BitCastInst>(StrippedBasePtr)) @@ -145,6 +172,14 @@ static bool IsPtrInBounds(Value *BasePtr) { return false; } +static std::string getInstrName(const Value *I, const std::string Suffix) { + assert(I && "Invalid parameter!"); + if (I->hasName()) + return (I->getName() + Suffix).str(); + else + return ""; +} + static Value *GetPointerOperand(Value *MemI) { if (LoadInst *LMemI = dyn_cast<LoadInst>(MemI)) { return LMemI->getPointerOperand(); @@ -167,6 +202,7 @@ bool PPCLoopPreIncPrep::runOnFunction(Function &F) { auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? &DTWP->getDomTree() : nullptr; PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); + ST = TM ?
TM->getSubtargetImpl(F) : nullptr; bool MadeChange = false; @@ -177,10 +213,280 @@ bool PPCLoopPreIncPrep::runOnFunction(Function &F) { return MadeChange; } +void PPCLoopPreIncPrep::addOneCandidate(Instruction *MemI, const SCEV *LSCEV, + SmallVector<Bucket, 16> &Buckets, + unsigned MaxCandidateNum) { + assert((MemI && GetPointerOperand(MemI)) && + "Candidate should be a memory instruction."); + assert(LSCEV && "Invalid SCEV for Ptr value."); + bool FoundBucket = false; + for (auto &B : Buckets) { + const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV); + if (const auto *CDiff = dyn_cast<SCEVConstant>(Diff)) { + B.Elements.push_back(BucketElement(CDiff, MemI)); + FoundBucket = true; + break; + } + } + + if (!FoundBucket) { + if (Buckets.size() == MaxCandidateNum) + return; + Buckets.push_back(Bucket(LSCEV, MemI)); + } +} + +SmallVector<Bucket, 16> PPCLoopPreIncPrep::collectCandidates( + Loop *L, + std::function<bool(const Instruction *, const Value *)> isValidCandidate, + unsigned MaxCandidateNum) { + SmallVector<Bucket, 16> Buckets; + for (const auto &BB : L->blocks()) + for (auto &J : *BB) { + Value *PtrValue; + Instruction *MemI; + + if (LoadInst *LMemI = dyn_cast<LoadInst>(&J)) { + MemI = LMemI; + PtrValue = LMemI->getPointerOperand(); + } else if (StoreInst *SMemI = dyn_cast<StoreInst>(&J)) { + MemI = SMemI; + PtrValue = SMemI->getPointerOperand(); + } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(&J)) { + if (IMemI->getIntrinsicID() == Intrinsic::prefetch) { + MemI = IMemI; + PtrValue = IMemI->getArgOperand(0); + } else continue; + } else continue; + + unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); + if (PtrAddrSpace) + continue; + + if (L->isLoopInvariant(PtrValue)) + continue; + + const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L); + const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV); + if (!LARSCEV || LARSCEV->getLoop() != L) + continue; + + if (isValidCandidate(&J, PtrValue)) + addOneCandidate(MemI, LSCEV, Buckets, MaxCandidateNum); + } + return Buckets; +} + +// TODO: implement a more clever base choosing policy. +// Currently we always choose an existing load/store offset. This may lead to +// suboptimal code sequences. For example, for one DS chain with offsets +// {-32769, 2003, 2007, 2011}, we choose -32769 as the base offset, and the +// remaining displacements for the loads/stores are {0, 34772, 34776, 34780}. +// Though each displacement is now a multiple of 4, it cannot be represented +// by a signed 16-bit integer. +bool PPCLoopPreIncPrep::prepareBaseForUpdateFormChain(Bucket &BucketChain) { + // We have a choice now of which instruction's memory operand we use as the + // base for the generated PHI. Always picking the first instruction in each + // bucket does not work well, specifically because that instruction might + // be a prefetch (and there are no pre-increment dcbt variants). Otherwise, + // the choice is somewhat arbitrary, because the backend will happily + // generate direct offsets from both the pre-incremented and + // post-incremented pointer values. Thus, we'll pick the first non-prefetch + // instruction in each bucket, and adjust the recurrence and other offsets + // accordingly. + for (int j = 0, je = BucketChain.Elements.size(); j != je; ++j) { + if (auto *II = dyn_cast<IntrinsicInst>(BucketChain.Elements[j].Instr)) + if (II->getIntrinsicID() == Intrinsic::prefetch) + continue; + + // If we'd otherwise pick the first element anyway, there's nothing to do.
+ if (j == 0) + break; + + // If our chosen element has no offset from the base pointer, there's + // nothing to do. + if (!BucketChain.Elements[j].Offset || + BucketChain.Elements[j].Offset->isZero()) + break; + + const SCEV *Offset = BucketChain.Elements[j].Offset; + BucketChain.BaseSCEV = SE->getAddExpr(BucketChain.BaseSCEV, Offset); + for (auto &E : BucketChain.Elements) { + if (E.Offset) + E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset)); + else + E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset)); + } + + std::swap(BucketChain.Elements[j], BucketChain.Elements[0]); + break; + } + return true; +} + +bool PPCLoopPreIncPrep::rewriteLoadStores( + Loop *L, Bucket &BucketChain, SmallSet<BasicBlock *, 16> &BBChanged) { + bool MadeChange = false; + const SCEVAddRecExpr *BasePtrSCEV = + cast<SCEVAddRecExpr>(BucketChain.BaseSCEV); + if (!BasePtrSCEV->isAffine()) + return MadeChange; + + LLVM_DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n"); + + assert(BasePtrSCEV->getLoop() == L && "AddRec for the wrong loop?"); + + // The instruction corresponding to the Bucket's BaseSCEV must be the first + // in the vector of elements. + Instruction *MemI = BucketChain.Elements.begin()->Instr; + Value *BasePtr = GetPointerOperand(MemI); + assert(BasePtr && "No pointer operand"); + + Type *I8Ty = Type::getInt8Ty(MemI->getParent()->getContext()); + Type *I8PtrTy = Type::getInt8PtrTy(MemI->getParent()->getContext(), + BasePtr->getType()->getPointerAddressSpace()); + + const SCEV *BasePtrStartSCEV = BasePtrSCEV->getStart(); + if (!SE->isLoopInvariant(BasePtrStartSCEV, L)) + return MadeChange; + + const SCEVConstant *BasePtrIncSCEV = + dyn_cast<SCEVConstant>(BasePtrSCEV->getStepRecurrence(*SE)); + if (!BasePtrIncSCEV) + return MadeChange; + BasePtrStartSCEV = SE->getMinusSCEV(BasePtrStartSCEV, BasePtrIncSCEV); + if (!isSafeToExpand(BasePtrStartSCEV, *SE)) + return MadeChange; + + if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV)) + return MadeChange; + + LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n"); + + BasicBlock *Header = L->getHeader(); + unsigned HeaderLoopPredCount = pred_size(Header); + BasicBlock *LoopPredecessor = L->getLoopPredecessor(); + + PHINode *NewPHI = + PHINode::Create(I8PtrTy, HeaderLoopPredCount, + getInstrName(MemI, PHINodeNameSuffix), + Header->getFirstNonPHI()); + + SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart"); + Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy, + LoopPredecessor->getTerminator()); + + // Note that LoopPredecessor might occur in the predecessor list multiple + // times, and we need to add it the right number of times. 
+ for (const auto &PI : predecessors(Header)) { + if (PI != LoopPredecessor) + continue; + + NewPHI->addIncoming(BasePtrStart, LoopPredecessor); + } + + Instruction *InsPoint = &*Header->getFirstInsertionPt(); + GetElementPtrInst *PtrInc = GetElementPtrInst::Create( + I8Ty, NewPHI, BasePtrIncSCEV->getValue(), + getInstrName(MemI, GEPNodeIncNameSuffix), InsPoint); + PtrInc->setIsInBounds(IsPtrInBounds(BasePtr)); + for (const auto &PI : predecessors(Header)) { + if (PI == LoopPredecessor) + continue; + + NewPHI->addIncoming(PtrInc, PI); + } + + Instruction *NewBasePtr; + if (PtrInc->getType() != BasePtr->getType()) + NewBasePtr = new BitCastInst(PtrInc, BasePtr->getType(), + getInstrName(PtrInc, CastNodeNameSuffix), InsPoint); + else + NewBasePtr = PtrInc; + + if (Instruction *IDel = dyn_cast<Instruction>(BasePtr)) + BBChanged.insert(IDel->getParent()); + BasePtr->replaceAllUsesWith(NewBasePtr); + RecursivelyDeleteTriviallyDeadInstructions(BasePtr); + + // Keep track of the replacement pointer values we've inserted so that we + // don't generate more pointer values than necessary. + SmallPtrSet<Value *, 16> NewPtrs; + NewPtrs.insert(NewBasePtr); + + for (auto I = std::next(BucketChain.Elements.begin()), + IE = BucketChain.Elements.end(); I != IE; ++I) { + Value *Ptr = GetPointerOperand(I->Instr); + assert(Ptr && "No pointer operand"); + if (NewPtrs.count(Ptr)) + continue; + + Instruction *RealNewPtr; + if (!I->Offset || I->Offset->getValue()->isZero()) { + RealNewPtr = NewBasePtr; + } else { + Instruction *PtrIP = dyn_cast<Instruction>(Ptr); + if (PtrIP && isa<Instruction>(NewBasePtr) && + cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent()) + PtrIP = nullptr; + else if (PtrIP && isa<PHINode>(PtrIP)) + PtrIP = &*PtrIP->getParent()->getFirstInsertionPt(); + else if (!PtrIP) + PtrIP = I->Instr; + + GetElementPtrInst *NewPtr = GetElementPtrInst::Create( + I8Ty, PtrInc, I->Offset->getValue(), + getInstrName(I->Instr, GEPNodeOffNameSuffix), PtrIP); + if (!PtrIP) + NewPtr->insertAfter(cast<Instruction>(PtrInc)); + NewPtr->setIsInBounds(IsPtrInBounds(Ptr)); + RealNewPtr = NewPtr; + } + + if (Instruction *IDel = dyn_cast<Instruction>(Ptr)) + BBChanged.insert(IDel->getParent()); + + Instruction *ReplNewPtr; + if (Ptr->getType() != RealNewPtr->getType()) { + ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(), + getInstrName(Ptr, CastNodeNameSuffix)); + ReplNewPtr->insertAfter(RealNewPtr); + } else + ReplNewPtr = RealNewPtr; + + Ptr->replaceAllUsesWith(ReplNewPtr); + RecursivelyDeleteTriviallyDeadInstructions(Ptr); + + NewPtrs.insert(RealNewPtr); + } + + MadeChange = true; + UpdFormChainRewritten++; + + return MadeChange; +} + +bool PPCLoopPreIncPrep::updateFormPrep(Loop *L, + SmallVector<Bucket, 16> &Buckets) { + bool MadeChange = false; + if (Buckets.empty()) + return MadeChange; + SmallSet<BasicBlock *, 16> BBChanged; + for (auto &Bucket : Buckets) + // The base address of each bucket is transformed into a phi and the others + // are rewritten based on new base. + if (prepareBaseForUpdateFormChain(Bucket)) + MadeChange |= rewriteLoadStores(L, Bucket, BBChanged); + if (MadeChange) + for (auto &BB : L->blocks()) + if (BBChanged.count(BB)) + DeleteDeadPHIs(BB); + return MadeChange; +} + // In order to prepare for the pre-increment a PHI is added. // This function will check to see if that PHI already exists and will return -// true if it found an existing PHI with the same start and increment as the -// one we wanted to create. 
+// true if it found an existing PHI with the same start and increment as the +// one we wanted to create. bool PPCLoopPreIncPrep::alreadyPrepared(Loop *L, Instruction* MemI, const SCEV *BasePtrStartSCEV, const SCEVConstant *BasePtrIncSCEV) { @@ -216,10 +522,10 @@ bool PPCLoopPreIncPrep::alreadyPrepared(Loop *L, Instruction* MemI, continue; if (CurrentPHINode->getNumIncomingValues() == 2) { - if ( (CurrentPHINode->getIncomingBlock(0) == LatchBB && - CurrentPHINode->getIncomingBlock(1) == PredBB) || - (CurrentPHINode->getIncomingBlock(1) == LatchBB && - CurrentPHINode->getIncomingBlock(0) == PredBB) ) { + if ((CurrentPHINode->getIncomingBlock(0) == LatchBB && + CurrentPHINode->getIncomingBlock(1) == PredBB) || + (CurrentPHINode->getIncomingBlock(1) == LatchBB && + CurrentPHINode->getIncomingBlock(0) == PredBB)) { if (PHIBasePtrSCEV->getStart() == BasePtrStartSCEV && PHIBasePtrIncSCEV == BasePtrIncSCEV) { // The existing PHI (CurrentPHINode) has the same start and increment @@ -242,89 +548,6 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { LLVM_DEBUG(dbgs() << "PIP: Examining: " << *L << "\n"); - BasicBlock *Header = L->getHeader(); - - const PPCSubtarget *ST = - TM ? TM->getSubtargetImpl(*Header->getParent()) : nullptr; - - unsigned HeaderLoopPredCount = pred_size(Header); - - // Collect buckets of comparable addresses used by loads and stores. - SmallVector<Bucket, 16> Buckets; - for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); - I != IE; ++I) { - for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end(); - J != JE; ++J) { - Value *PtrValue; - Instruction *MemI; - - if (LoadInst *LMemI = dyn_cast<LoadInst>(J)) { - MemI = LMemI; - PtrValue = LMemI->getPointerOperand(); - } else if (StoreInst *SMemI = dyn_cast<StoreInst>(J)) { - MemI = SMemI; - PtrValue = SMemI->getPointerOperand(); - } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(J)) { - if (IMemI->getIntrinsicID() == Intrinsic::prefetch) { - MemI = IMemI; - PtrValue = IMemI->getArgOperand(0); - } else continue; - } else continue; - - unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); - if (PtrAddrSpace) - continue; - - // There are no update forms for Altivec vector load/stores. - if (ST && ST->hasAltivec() && - PtrValue->getType()->getPointerElementType()->isVectorTy()) - continue; - - if (L->isLoopInvariant(PtrValue)) - continue; - - const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L); - if (const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV)) { - if (LARSCEV->getLoop() != L) - continue; - // See getPreIndexedAddressParts, the displacement for LDU/STDU has to - // be 4's multiple (DS-form). For i64 loads/stores when the displacement - // fits in a 16-bit signed field but isn't a multiple of 4, it will be - // useless and possible to break some original well-form addressing mode - // to make this pre-inc prep for it. 
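The comment in the removed block above explains why certain i64 strides were rejected. A standalone restatement of that old test (the helper name and values are illustrative):

#include <cassert>
#include <cstdint>

// Mirror of the removed skip condition: an i64 update-form displacement must
// be a multiple of 4 (DS-form). A step that fits in 16 signed bits but is
// not 4-aligned would only break otherwise well-formed addressing.
bool stepSuitsDSForm(int64_t Step) {
  bool FitsSigned16 = Step >= -32768 && Step <= 32767;
  return !(FitsSigned16 && Step % 4 != 0);
}

int main() {
  assert(stepSuitsDSForm(8));       // aligned: ldu/stdu remain usable
  assert(!stepSuitsDSForm(6));      // fits int16 but misaligned: skip prep
  assert(stepSuitsDSForm(100000));  // out of int16 range: handled elsewhere
}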
- if (PtrValue->getType()->getPointerElementType()->isIntegerTy(64)) { - if (const SCEVConstant *StepConst = - dyn_cast<SCEVConstant>(LARSCEV->getStepRecurrence(*SE))) { - const APInt &ConstInt = StepConst->getValue()->getValue(); - if (ConstInt.isSignedIntN(16) && ConstInt.srem(4) != 0) - continue; - } - } - } else { - continue; - } - - bool FoundBucket = false; - for (auto &B : Buckets) { - const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV); - if (const auto *CDiff = dyn_cast<SCEVConstant>(Diff)) { - B.Elements.push_back(BucketElement(CDiff, MemI)); - FoundBucket = true; - break; - } - } - - if (!FoundBucket) { - if (Buckets.size() == MaxVars) - return MadeChange; - Buckets.push_back(Bucket(LSCEV, MemI)); - } - } - } - - if (Buckets.empty()) - return MadeChange; - BasicBlock *LoopPredecessor = L->getLoopPredecessor(); // If there is no loop predecessor, or the loop predecessor's terminator // returns a value (which might contribute to determining the loop's @@ -335,191 +558,48 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { if (LoopPredecessor) MadeChange = true; } - if (!LoopPredecessor) + if (!LoopPredecessor) { + LLVM_DEBUG(dbgs() << "PIP fails since no predecessor for current loop.\n"); return MadeChange; + } - LLVM_DEBUG(dbgs() << "PIP: Found " << Buckets.size() << " buckets\n"); - - SmallSet<BasicBlock *, 16> BBChanged; - for (unsigned i = 0, e = Buckets.size(); i != e; ++i) { - // The base address of each bucket is transformed into a phi and the others - // are rewritten as offsets of that variable. - - // We have a choice now of which instruction's memory operand we use as the - // base for the generated PHI. Always picking the first instruction in each - // bucket does not work well, specifically because that instruction might - // be a prefetch (and there are no pre-increment dcbt variants). Otherwise, - // the choice is somewhat arbitrary, because the backend will happily - // generate direct offsets from both the pre-incremented and - // post-incremented pointer values. Thus, we'll pick the first non-prefetch - // instruction in each bucket, and adjust the recurrence and other offsets - // accordingly. - for (int j = 0, je = Buckets[i].Elements.size(); j != je; ++j) { - if (auto *II = dyn_cast<IntrinsicInst>(Buckets[i].Elements[j].Instr)) - if (II->getIntrinsicID() == Intrinsic::prefetch) - continue; - - // If we'd otherwise pick the first element anyway, there's nothing to do. - if (j == 0) - break; - - // If our chosen element has no offset from the base pointer, there's - // nothing to do. - if (!Buckets[i].Elements[j].Offset || - Buckets[i].Elements[j].Offset->isZero()) - break; - - const SCEV *Offset = Buckets[i].Elements[j].Offset; - Buckets[i].BaseSCEV = SE->getAddExpr(Buckets[i].BaseSCEV, Offset); - for (auto &E : Buckets[i].Elements) { - if (E.Offset) - E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset)); - else - E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset)); - } - - std::swap(Buckets[i].Elements[j], Buckets[i].Elements[0]); - break; - } - - const SCEVAddRecExpr *BasePtrSCEV = - cast<SCEVAddRecExpr>(Buckets[i].BaseSCEV); - if (!BasePtrSCEV->isAffine()) - continue; - - LLVM_DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n"); - assert(BasePtrSCEV->getLoop() == L && - "AddRec for the wrong loop?"); - - // The instruction corresponding to the Bucket's BaseSCEV must be the first - // in the vector of elements. 
- Instruction *MemI = Buckets[i].Elements.begin()->Instr; - Value *BasePtr = GetPointerOperand(MemI); - assert(BasePtr && "No pointer operand"); - - Type *I8Ty = Type::getInt8Ty(MemI->getParent()->getContext()); - Type *I8PtrTy = Type::getInt8PtrTy(MemI->getParent()->getContext(), - BasePtr->getType()->getPointerAddressSpace()); - - const SCEV *BasePtrStartSCEV = BasePtrSCEV->getStart(); - if (!SE->isLoopInvariant(BasePtrStartSCEV, L)) - continue; - - const SCEVConstant *BasePtrIncSCEV = - dyn_cast<SCEVConstant>(BasePtrSCEV->getStepRecurrence(*SE)); - if (!BasePtrIncSCEV) - continue; - BasePtrStartSCEV = SE->getMinusSCEV(BasePtrStartSCEV, BasePtrIncSCEV); - if (!isSafeToExpand(BasePtrStartSCEV, *SE)) - continue; - - LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n"); - - if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV)) - continue; - - PHINode *NewPHI = PHINode::Create(I8PtrTy, HeaderLoopPredCount, - MemI->hasName() ? MemI->getName() + ".phi" : "", - Header->getFirstNonPHI()); - - SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart"); - Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy, - LoopPredecessor->getTerminator()); - - // Note that LoopPredecessor might occur in the predecessor list multiple - // times, and we need to add it the right number of times. - for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); - PI != PE; ++PI) { - if (*PI != LoopPredecessor) - continue; - - NewPHI->addIncoming(BasePtrStart, LoopPredecessor); - } - - Instruction *InsPoint = &*Header->getFirstInsertionPt(); - GetElementPtrInst *PtrInc = GetElementPtrInst::Create( - I8Ty, NewPHI, BasePtrIncSCEV->getValue(), - MemI->hasName() ? MemI->getName() + ".inc" : "", InsPoint); - PtrInc->setIsInBounds(IsPtrInBounds(BasePtr)); - for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); - PI != PE; ++PI) { - if (*PI == LoopPredecessor) - continue; - - NewPHI->addIncoming(PtrInc, *PI); - } - - Instruction *NewBasePtr; - if (PtrInc->getType() != BasePtr->getType()) - NewBasePtr = new BitCastInst(PtrInc, BasePtr->getType(), - PtrInc->hasName() ? PtrInc->getName() + ".cast" : "", InsPoint); - else - NewBasePtr = PtrInc; - - if (Instruction *IDel = dyn_cast<Instruction>(BasePtr)) - BBChanged.insert(IDel->getParent()); - BasePtr->replaceAllUsesWith(NewBasePtr); - RecursivelyDeleteTriviallyDeadInstructions(BasePtr); - - // Keep track of the replacement pointer values we've inserted so that we - // don't generate more pointer values than necessary. - SmallPtrSet<Value *, 16> NewPtrs; - NewPtrs.insert( NewBasePtr); - - for (auto I = std::next(Buckets[i].Elements.begin()), - IE = Buckets[i].Elements.end(); I != IE; ++I) { - Value *Ptr = GetPointerOperand(I->Instr); - assert(Ptr && "No pointer operand"); - if (NewPtrs.count(Ptr)) - continue; - - Instruction *RealNewPtr; - if (!I->Offset || I->Offset->getValue()->isZero()) { - RealNewPtr = NewBasePtr; - } else { - Instruction *PtrIP = dyn_cast<Instruction>(Ptr); - if (PtrIP && isa<Instruction>(NewBasePtr) && - cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent()) - PtrIP = nullptr; - else if (isa<PHINode>(PtrIP)) - PtrIP = &*PtrIP->getParent()->getFirstInsertionPt(); - else if (!PtrIP) - PtrIP = I->Instr; - - GetElementPtrInst *NewPtr = GetElementPtrInst::Create( - I8Ty, PtrInc, I->Offset->getValue(), - I->Instr->hasName() ? 
I->Instr->getName() + ".off" : "", PtrIP); - if (!PtrIP) - NewPtr->insertAfter(cast<Instruction>(PtrInc)); - NewPtr->setIsInBounds(IsPtrInBounds(Ptr)); - RealNewPtr = NewPtr; + // Check whether a load/store has an update form. This lambda is used by + // collectCandidates, which gathers the candidates that satisfy it. + auto isUpdateFormCandidate = [&] (const Instruction *I, + const Value *PtrValue) { + assert((PtrValue && I) && "Invalid parameter!"); + // There are no update forms for Altivec vector load/stores. + if (ST && ST->hasAltivec() && + PtrValue->getType()->getPointerElementType()->isVectorTy()) + return false; + // See getPreIndexedAddressParts: the displacement for LDU/STDU has to + // be a multiple of 4 (DS-form). For i64 loads/stores whose displacement + // fits in a 16-bit signed field but isn't a multiple of 4, this pre-inc + // prep is useless and may even break an originally well-formed + // addressing mode. + if (PtrValue->getType()->getPointerElementType()->isIntegerTy(64)) { + const SCEV *LSCEV = SE->getSCEVAtScope(const_cast<Value *>(PtrValue), L); + const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV); + if (!LARSCEV || LARSCEV->getLoop() != L) + return false; + if (const SCEVConstant *StepConst = + dyn_cast<SCEVConstant>(LARSCEV->getStepRecurrence(*SE))) { + const APInt &ConstInt = StepConst->getValue()->getValue(); + if (ConstInt.isSignedIntN(16) && ConstInt.srem(4) != 0) + return false; } - - if (Instruction *IDel = dyn_cast<Instruction>(Ptr)) - BBChanged.insert(IDel->getParent()); - - Instruction *ReplNewPtr; - if (Ptr->getType() != RealNewPtr->getType()) { - ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(), - Ptr->hasName() ? Ptr->getName() + ".cast" : ""); - ReplNewPtr->insertAfter(RealNewPtr); - } else - ReplNewPtr = RealNewPtr; - - Ptr->replaceAllUsesWith(ReplNewPtr); - RecursivelyDeleteTriviallyDeadInstructions(Ptr); - - NewPtrs.insert(RealNewPtr); } + return true; + }; - MadeChange = true; - } + // Collect buckets of comparable addresses used by loads, stores and the + // prefetch intrinsic, for the update form. + SmallVector<Bucket, 16> UpdateFormBuckets = + collectCandidates(L, isUpdateFormCandidate, MaxVars); - for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); - I != IE; ++I) { - if (BBChanged.count(*I)) - DeleteDeadPHIs(*I); - } + // Prepare for the update form. + if (!UpdateFormBuckets.empty()) + MadeChange |= updateFormPrep(L, UpdateFormBuckets); return MadeChange; } diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp index 027e6bd1ba06..b6496f189a3a 100644 --- a/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -79,7 +79,7 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, } static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, - AsmPrinter &Printer, bool isDarwin) { + AsmPrinter &Printer, bool IsDarwin) { MCContext &Ctx = Printer.OutContext; MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; @@ -137,10 +137,10 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, // Add ha16() / lo16() markers if required.
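// NOTE (illustration, not from the patch): MO_HA/MO_LO correspond to the
// usual two-instruction materialization of a 32-bit address, e.g.:
//   lis r3, sym@ha
//   lwz r4, sym@l(r3)
// where @ha is "high adjusted", compensating for the sign extension of
// the low 16 bits added by @l.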
switch (access) { case PPCII::MO_LO: - Expr = PPCMCExpr::createLo(Expr, isDarwin, Ctx); + Expr = PPCMCExpr::createLo(Expr, IsDarwin, Ctx); break; case PPCII::MO_HA: - Expr = PPCMCExpr::createHa(Expr, isDarwin, Ctx); + Expr = PPCMCExpr::createHa(Expr, IsDarwin, Ctx); break; } @@ -148,20 +148,20 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, } void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, - AsmPrinter &AP, bool isDarwin) { + AsmPrinter &AP, bool IsDarwin) { OutMI.setOpcode(MI->getOpcode()); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MCOperand MCOp; if (LowerPPCMachineOperandToMCOperand(MI->getOperand(i), MCOp, AP, - isDarwin)) + IsDarwin)) OutMI.addOperand(MCOp); } } bool llvm::LowerPPCMachineOperandToMCOperand(const MachineOperand &MO, MCOperand &OutMO, AsmPrinter &AP, - bool isDarwin) { + bool IsDarwin) { switch (MO.getType()) { default: llvm_unreachable("unknown operand type"); @@ -181,17 +181,20 @@ bool llvm::LowerPPCMachineOperandToMCOperand(const MachineOperand &MO, return true; case MachineOperand::MO_GlobalAddress: case MachineOperand::MO_ExternalSymbol: - OutMO = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, isDarwin); + OutMO = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, IsDarwin); return true; case MachineOperand::MO_JumpTableIndex: - OutMO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, isDarwin); + OutMO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, IsDarwin); return true; case MachineOperand::MO_ConstantPoolIndex: - OutMO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, isDarwin); + OutMO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, IsDarwin); return true; case MachineOperand::MO_BlockAddress: OutMO = GetSymbolRef(MO, AP.GetBlockAddressSymbol(MO.getBlockAddress()), AP, - isDarwin); + IsDarwin); + return true; + case MachineOperand::MO_MCSymbol: + OutMO = GetSymbolRef(MO, MO.getMCSymbol(), AP, IsDarwin); return true; case MachineOperand::MO_RegisterMask: return false; diff --git a/lib/Target/PowerPC/PPCMIPeephole.cpp b/lib/Target/PowerPC/PPCMIPeephole.cpp index 446246358e96..ac8ac060f460 100644 --- a/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -148,8 +148,8 @@ static MachineInstr *getVRegDefOrNull(MachineOperand *Op, if (!Op->isReg()) return nullptr; - unsigned Reg = Op->getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = Op->getReg(); + if (!Register::isVirtualRegister(Reg)) return nullptr; return MRI->getVRegDef(Reg); @@ -344,8 +344,7 @@ bool PPCMIPeephole::simplifyCode(void) { unsigned TrueReg2 = TRI->lookThruCopyLike(MI.getOperand(2).getReg(), MRI); - if (TrueReg1 == TrueReg2 - && TargetRegisterInfo::isVirtualRegister(TrueReg1)) { + if (TrueReg1 == TrueReg2 && Register::isVirtualRegister(TrueReg1)) { MachineInstr *DefMI = MRI->getVRegDef(TrueReg1); unsigned DefOpc = DefMI ? DefMI->getOpcode() : 0; @@ -358,7 +357,7 @@ bool PPCMIPeephole::simplifyCode(void) { return false; unsigned DefReg = TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI); - if (TargetRegisterInfo::isVirtualRegister(DefReg)) { + if (Register::isVirtualRegister(DefReg)) { MachineInstr *LoadMI = MRI->getVRegDef(DefReg); if (LoadMI && LoadMI->getOpcode() == PPC::LXVDSX) return true; @@ -444,7 +443,7 @@ bool PPCMIPeephole::simplifyCode(void) { unsigned OpNo = MyOpcode == PPC::XXSPLTW ? 
1 : 2; unsigned TrueReg = TRI->lookThruCopyLike(MI.getOperand(OpNo).getReg(), MRI); - if (!TargetRegisterInfo::isVirtualRegister(TrueReg)) + if (!Register::isVirtualRegister(TrueReg)) break; MachineInstr *DefMI = MRI->getVRegDef(TrueReg); if (!DefMI) @@ -453,8 +452,8 @@ bool PPCMIPeephole::simplifyCode(void) { auto isConvertOfSplat = [=]() -> bool { if (DefOpcode != PPC::XVCVSPSXWS && DefOpcode != PPC::XVCVSPUXWS) return false; - unsigned ConvReg = DefMI->getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(ConvReg)) + Register ConvReg = DefMI->getOperand(1).getReg(); + if (!Register::isVirtualRegister(ConvReg)) return false; MachineInstr *Splt = MRI->getVRegDef(ConvReg); return Splt && (Splt->getOpcode() == PPC::LXVWSX || @@ -481,9 +480,9 @@ bool PPCMIPeephole::simplifyCode(void) { // Splat fed by a shift. Usually when we align value to splat into // vector element zero. if (DefOpcode == PPC::XXSLDWI) { - unsigned ShiftRes = DefMI->getOperand(0).getReg(); - unsigned ShiftOp1 = DefMI->getOperand(1).getReg(); - unsigned ShiftOp2 = DefMI->getOperand(2).getReg(); + Register ShiftRes = DefMI->getOperand(0).getReg(); + Register ShiftOp1 = DefMI->getOperand(1).getReg(); + Register ShiftOp2 = DefMI->getOperand(2).getReg(); unsigned ShiftImm = DefMI->getOperand(3).getImm(); unsigned SplatImm = MI.getOperand(2).getImm(); if (ShiftOp1 == ShiftOp2) { @@ -507,7 +506,7 @@ bool PPCMIPeephole::simplifyCode(void) { // If this is a DP->SP conversion fed by an FRSP, the FRSP is redundant. unsigned TrueReg = TRI->lookThruCopyLike(MI.getOperand(1).getReg(), MRI); - if (!TargetRegisterInfo::isVirtualRegister(TrueReg)) + if (!Register::isVirtualRegister(TrueReg)) break; MachineInstr *DefMI = MRI->getVRegDef(TrueReg); @@ -518,8 +517,8 @@ bool PPCMIPeephole::simplifyCode(void) { TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI); unsigned DefsReg2 = TRI->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI); - if (!TargetRegisterInfo::isVirtualRegister(DefsReg1) || - !TargetRegisterInfo::isVirtualRegister(DefsReg2)) + if (!Register::isVirtualRegister(DefsReg1) || + !Register::isVirtualRegister(DefsReg2)) break; MachineInstr *P1 = MRI->getVRegDef(DefsReg1); MachineInstr *P2 = MRI->getVRegDef(DefsReg2); @@ -533,8 +532,8 @@ bool PPCMIPeephole::simplifyCode(void) { if (RoundInstr->getOpcode() == PPC::FRSP && MRI->hasOneNonDBGUse(RoundInstr->getOperand(0).getReg())) { Simplified = true; - unsigned ConvReg1 = RoundInstr->getOperand(1).getReg(); - unsigned FRSPDefines = RoundInstr->getOperand(0).getReg(); + Register ConvReg1 = RoundInstr->getOperand(1).getReg(); + Register FRSPDefines = RoundInstr->getOperand(0).getReg(); MachineInstr &Use = *(MRI->use_instr_begin(FRSPDefines)); for (int i = 0, e = Use.getNumOperands(); i < e; ++i) if (Use.getOperand(i).isReg() && @@ -566,8 +565,8 @@ bool PPCMIPeephole::simplifyCode(void) { case PPC::EXTSH8: case PPC::EXTSH8_32_64: { if (!EnableSExtElimination) break; - unsigned NarrowReg = MI.getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(NarrowReg)) + Register NarrowReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(NarrowReg)) break; MachineInstr *SrcMI = MRI->getVRegDef(NarrowReg); @@ -610,8 +609,8 @@ bool PPCMIPeephole::simplifyCode(void) { case PPC::EXTSW_32: case PPC::EXTSW_32_64: { if (!EnableSExtElimination) break; - unsigned NarrowReg = MI.getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(NarrowReg)) + Register NarrowReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(NarrowReg)) 
break; MachineInstr *SrcMI = MRI->getVRegDef(NarrowReg); @@ -652,8 +651,8 @@ bool PPCMIPeephole::simplifyCode(void) { // We can eliminate EXTSW if the input is known to be already // sign-extended. LLVM_DEBUG(dbgs() << "Removing redundant sign-extension\n"); - unsigned TmpReg = - MF->getRegInfo().createVirtualRegister(&PPC::G8RCRegClass); + Register TmpReg = + MF->getRegInfo().createVirtualRegister(&PPC::G8RCRegClass); BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::IMPLICIT_DEF), TmpReg); BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::INSERT_SUBREG), @@ -679,8 +678,8 @@ bool PPCMIPeephole::simplifyCode(void) { if (MI.getOperand(2).getImm() != 0) break; - unsigned SrcReg = MI.getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + Register SrcReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(SrcReg)) break; MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); @@ -695,8 +694,8 @@ bool PPCMIPeephole::simplifyCode(void) { SrcMI = SubRegMI; if (SubRegMI->getOpcode() == PPC::COPY) { - unsigned CopyReg = SubRegMI->getOperand(1).getReg(); - if (TargetRegisterInfo::isVirtualRegister(CopyReg)) + Register CopyReg = SubRegMI->getOperand(1).getReg(); + if (Register::isVirtualRegister(CopyReg)) SrcMI = MRI->getVRegDef(CopyReg); } @@ -757,7 +756,7 @@ bool PPCMIPeephole::simplifyCode(void) { break; // We don't have an ADD fed by LI's that can be transformed // Now we know that Op1 is the PHI node and Op2 is the dominator - unsigned DominatorReg = Op2.getReg(); + Register DominatorReg = Op2.getReg(); const TargetRegisterClass *TRC = MI.getOpcode() == PPC::ADD8 ? &PPC::G8RC_and_G8RC_NOX0RegClass @@ -927,7 +926,7 @@ static unsigned getSrcVReg(unsigned Reg, MachineBasicBlock *BB1, } else if (Inst->isFullCopy()) NextReg = Inst->getOperand(1).getReg(); - if (NextReg == SrcReg || !TargetRegisterInfo::isVirtualRegister(NextReg)) + if (NextReg == SrcReg || !Register::isVirtualRegister(NextReg)) break; SrcReg = NextReg; } @@ -949,9 +948,8 @@ static bool eligibleForCompareElimination(MachineBasicBlock &MBB, (*BII).getOpcode() == PPC::BCC && (*BII).getOperand(1).isReg()) { // We optimize only if the condition code is used only by one BCC. - unsigned CndReg = (*BII).getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(CndReg) || - !MRI->hasOneNonDBGUse(CndReg)) + Register CndReg = (*BII).getOperand(1).getReg(); + if (!Register::isVirtualRegister(CndReg) || !MRI->hasOneNonDBGUse(CndReg)) return false; MachineInstr *CMPI = MRI->getVRegDef(CndReg); @@ -961,7 +959,7 @@ static bool eligibleForCompareElimination(MachineBasicBlock &MBB, // We skip this BB if a physical register is used in comparison. for (MachineOperand &MO : CMPI->operands()) - if (MO.isReg() && !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (MO.isReg() && !Register::isVirtualRegister(MO.getReg())) return false; return true; @@ -1271,8 +1269,8 @@ bool PPCMIPeephole::eliminateRedundantCompare(void) { // We touch up the compare instruction in MBB2 and move it to // a previous BB to handle partially redundant case. 
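// NOTE (sketch of the partially redundant case, inferred from the code
// below; names follow the local variables):
//   MBBtoMoveCmp:  %cr1 = CMPxx ...                  ; CMPI2 after the splice
//   MBB1:          %cr0 = CMPxx ... ; BCC %cr0, ...  ; CMPI1 feeding BI1
//   MBB2:          %newvreg = PHI [%cr0, MBB1], [%cr1, MBBtoMoveCmp]
// so the conditional branch in MBB2 can use NewVReg instead of
// re-evaluating the compare on the path through MBB1.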
if (SwapOperands) { - unsigned Op1 = CMPI2->getOperand(1).getReg(); - unsigned Op2 = CMPI2->getOperand(2).getReg(); + Register Op1 = CMPI2->getOperand(1).getReg(); + Register Op2 = CMPI2->getOperand(2).getReg(); CMPI2->getOperand(1).setReg(Op2); CMPI2->getOperand(2).setReg(Op1); } @@ -1295,7 +1293,7 @@ bool PPCMIPeephole::eliminateRedundantCompare(void) { MBBtoMoveCmp->splice(I, &MBB2, MachineBasicBlock::iterator(CMPI2)); DebugLoc DL = CMPI2->getDebugLoc(); - unsigned NewVReg = MRI->createVirtualRegister(&PPC::CRRCRegClass); + Register NewVReg = MRI->createVirtualRegister(&PPC::CRRCRegClass); BuildMI(MBB2, MBB2.begin(), DL, TII->get(PPC::PHI), NewVReg) .addReg(BI1->getOperand(1).getReg()).addMBB(MBB1) @@ -1334,8 +1332,8 @@ bool PPCMIPeephole::emitRLDICWhenLoweringJumpTables(MachineInstr &MI) { if (MI.getOpcode() != PPC::RLDICR) return false; - unsigned SrcReg = MI.getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + Register SrcReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(SrcReg)) return false; MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); @@ -1414,8 +1412,8 @@ bool PPCMIPeephole::combineSEXTAndSHL(MachineInstr &MI, if (SHMI + MEMI != 63) return false; - unsigned SrcReg = MI.getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + Register SrcReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(SrcReg)) return false; MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); @@ -1428,6 +1426,12 @@ bool PPCMIPeephole::combineSEXTAndSHL(MachineInstr &MI, if (!MRI->hasOneNonDBGUse(SrcReg)) return false; + assert(SrcMI->getNumOperands() == 2 && "EXTSW should have 2 operands"); + assert(SrcMI->getOperand(1).isReg() && + "EXTSW's second operand should be a register"); + if (!Register::isVirtualRegister(SrcMI->getOperand(1).getReg())) + return false; + LLVM_DEBUG(dbgs() << "Combining pair: "); LLVM_DEBUG(SrcMI->dump()); LLVM_DEBUG(MI.dump()); diff --git a/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp index d83c92276800..b1c0433641dd 100644 --- a/lib/Target/PowerPC/PPCPreEmitPeephole.cpp +++ b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp @@ -57,6 +57,109 @@ namespace { MachineFunctionProperties::Property::NoVRegs); } + // This function removes any redundant load immediates. It has two levels + // of loops: the outer loop finds load immediates BBI that could be used + // to replace a following redundancy. The inner loop scans the instructions + // after BBI looking for that redundancy and updates kill/dead flags + // accordingly. If AfterBBI loads the same immediate into the same register + // as BBI, it is redundant; any other instruction that modifies the def + // register of BBI stops the scan. + // DeadOrKillToUnset is a pointer to the previous operand that had the + // kill/dead flag set. It keeps track of the def register of BBI, the use + // registers of AfterBBIs and the def registers of AfterBBIs. + bool removeRedundantLIs(MachineBasicBlock &MBB, + const TargetRegisterInfo *TRI) { + LLVM_DEBUG(dbgs() << "Remove redundant load immediates from MBB:\n"; + MBB.dump(); dbgs() << "\n"); + + DenseSet<MachineInstr *> InstrsToErase; + for (auto BBI = MBB.instr_begin(); BBI != MBB.instr_end(); ++BBI) { + // Skip a load immediate that is marked to be erased later because it + // cannot be used to replace any other instruction. + if (InstrsToErase.find(&*BBI) != InstrsToErase.end()) + continue; + // Skip anything that is not a load immediate.
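// NOTE (illustration, not from the patch): only the load-immediate
// opcodes LI/LI8/LIS/LIS8 qualify below, and the redundancy removed has
// the shape
//   r3 = LI 42
//   ...            ; uses of r3, nothing redefining it
//   r3 = LI 42     ; same register, same immediate: erased after the
//                  ; kill/dead flag recorded for r3 is cleared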
+ unsigned Opc = BBI->getOpcode(); + if (Opc != PPC::LI && Opc != PPC::LI8 && Opc != PPC::LIS && + Opc != PPC::LIS8) + continue; + // Skip a load immediate whose operand is a relocation (e.g., $r3 = + // LI target-flags(ppc-lo) %const.0). + if (!BBI->getOperand(1).isImm()) + continue; + assert(BBI->getOperand(0).isReg() && + "Expected a register for the first operand"); + + LLVM_DEBUG(dbgs() << "Scanning after load immediate: "; BBI->dump();); + + Register Reg = BBI->getOperand(0).getReg(); + int64_t Imm = BBI->getOperand(1).getImm(); + MachineOperand *DeadOrKillToUnset = nullptr; + if (BBI->getOperand(0).isDead()) { + DeadOrKillToUnset = &BBI->getOperand(0); + LLVM_DEBUG(dbgs() << " Kill flag of " << *DeadOrKillToUnset + << " from load immediate " << *BBI + << " is an unsetting candidate\n"); + } + // This loop scans the instructions after BBI to see if there is any + // redundant load immediate. + for (auto AfterBBI = std::next(BBI); AfterBBI != MBB.instr_end(); + ++AfterBBI) { + // Track the operand that kills Reg. We will unset its kill flag if + // there is a following redundant load immediate. + int KillIdx = AfterBBI->findRegisterUseOperandIdx(Reg, true, TRI); + if (KillIdx != -1) { + assert(!DeadOrKillToUnset && "Shouldn't kill same register twice"); + DeadOrKillToUnset = &AfterBBI->getOperand(KillIdx); + LLVM_DEBUG(dbgs() + << " Kill flag of " << *DeadOrKillToUnset << " from " + << *AfterBBI << " is an unsetting candidate\n"); + } + + if (!AfterBBI->modifiesRegister(Reg, TRI)) + continue; + // Finish scanning because Reg is overwritten by a non-load + // instruction. + if (AfterBBI->getOpcode() != Opc) + break; + assert(AfterBBI->getOperand(0).isReg() && + "Expected a register for the first operand"); + // Finish scanning because Reg is overwritten by a relocation or a + // different value. + if (!AfterBBI->getOperand(1).isImm() || + AfterBBI->getOperand(1).getImm() != Imm) + break; + + // AfterBBI loads the same immediate value into the same Reg, which is + // redundant. Unset the kill flag on the previous use of Reg first to + // extend its live range, then remove the redundancy. + if (DeadOrKillToUnset) { + LLVM_DEBUG(dbgs() + << " Unset dead/kill flag of " << *DeadOrKillToUnset + << " from " << *DeadOrKillToUnset->getParent()); + if (DeadOrKillToUnset->isDef()) + DeadOrKillToUnset->setIsDead(false); + else + DeadOrKillToUnset->setIsKill(false); + } + DeadOrKillToUnset = + AfterBBI->findRegisterDefOperand(Reg, true, true, TRI); + if (DeadOrKillToUnset) + LLVM_DEBUG(dbgs() + << " Dead flag of " << *DeadOrKillToUnset << " from " + << *AfterBBI << " is an unsetting candidate\n"); + InstrsToErase.insert(&*AfterBBI); + LLVM_DEBUG(dbgs() << " Remove redundant load immediate: "; + AfterBBI->dump()); + } + } + + for (MachineInstr *MI : InstrsToErase) { + MI->eraseFromParent(); + } + NumRemovedInPreEmit += InstrsToErase.size(); + return !InstrsToErase.empty(); + } + bool runOnMachineFunction(MachineFunction &MF) override { if (skipFunction(MF.getFunction()) || !RunPreEmitPeephole) return false; @@ -65,6 +168,7 @@ namespace { const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); SmallVector<MachineInstr *, 4> InstrsToErase; for (MachineBasicBlock &MBB : MF) { + Changed |= removeRedundantLIs(MBB, TRI); for (MachineInstr &MI : MBB) { unsigned Opc = MI.getOpcode(); // Detect self copies - these can result from running AADB.
@@ -111,7 +215,7 @@ namespace { if (Br->getOpcode() != PPC::BC && Br->getOpcode() != PPC::BCn) continue; MachineInstr *CRSetMI = nullptr; - unsigned CRBit = Br->getOperand(0).getReg(); + Register CRBit = Br->getOperand(0).getReg(); unsigned CRReg = getCRFromCRBit(CRBit); bool SeenUse = false; MachineBasicBlock::reverse_iterator It = Br, Er = MBB.rend(); diff --git a/lib/Target/PowerPC/PPCQPXLoadSplat.cpp b/lib/Target/PowerPC/PPCQPXLoadSplat.cpp index 3a83cc27439c..6e9042643820 100644 --- a/lib/Target/PowerPC/PPCQPXLoadSplat.cpp +++ b/lib/Target/PowerPC/PPCQPXLoadSplat.cpp @@ -79,8 +79,8 @@ bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) { for (auto SI = Splats.begin(); SI != Splats.end();) { MachineInstr *SMI = *SI; - unsigned SplatReg = SMI->getOperand(0).getReg(); - unsigned SrcReg = SMI->getOperand(1).getReg(); + Register SplatReg = SMI->getOperand(0).getReg(); + Register SrcReg = SMI->getOperand(1).getReg(); if (MI->modifiesRegister(SrcReg, TRI)) { switch (MI->getOpcode()) { @@ -102,7 +102,7 @@ bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) { // the QPX splat source register. unsigned SubRegIndex = TRI->getSubRegIndex(SrcReg, MI->getOperand(0).getReg()); - unsigned SplatSubReg = TRI->getSubReg(SplatReg, SubRegIndex); + Register SplatSubReg = TRI->getSubReg(SplatReg, SubRegIndex); // Substitute both the explicit defined register, and also the // implicit def of the containing QPX register. diff --git a/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/lib/Target/PowerPC/PPCReduceCRLogicals.cpp index 8eaa6dfe2bf7..3b71ed219c17 100644 --- a/lib/Target/PowerPC/PPCReduceCRLogicals.cpp +++ b/lib/Target/PowerPC/PPCReduceCRLogicals.cpp @@ -381,10 +381,10 @@ private: const MachineBranchProbabilityInfo *MBPI; // A vector to contain all the CR logical operations - std::vector<CRLogicalOpInfo> AllCRLogicalOps; + SmallVector<CRLogicalOpInfo, 16> AllCRLogicalOps; void initialize(MachineFunction &MFParm); void collectCRLogicals(); - bool handleCROp(CRLogicalOpInfo &CRI); + bool handleCROp(unsigned Idx); bool splitBlockOnBinaryCROp(CRLogicalOpInfo &CRI); static bool isCRLogical(MachineInstr &MI) { unsigned Opc = MI.getOpcode(); @@ -398,7 +398,7 @@ private: // Not using a range-based for loop here as the vector may grow while being // operated on. for (unsigned i = 0; i < AllCRLogicalOps.size(); i++) - Changed |= handleCROp(AllCRLogicalOps[i]); + Changed |= handleCROp(i); return Changed; } @@ -535,15 +535,15 @@ MachineInstr *PPCReduceCRLogicals::lookThroughCRCopy(unsigned Reg, unsigned &Subreg, MachineInstr *&CpDef) { Subreg = -1; - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return nullptr; MachineInstr *Copy = MRI->getVRegDef(Reg); CpDef = Copy; if (!Copy->isCopy()) return Copy; - unsigned CopySrc = Copy->getOperand(1).getReg(); + Register CopySrc = Copy->getOperand(1).getReg(); Subreg = Copy->getOperand(1).getSubReg(); - if (!TargetRegisterInfo::isVirtualRegister(CopySrc)) { + if (!Register::isVirtualRegister(CopySrc)) { const TargetRegisterInfo *TRI = &TII->getRegisterInfo(); // Set the Subreg if (CopySrc == PPC::CR0EQ || CopySrc == PPC::CR6EQ) @@ -578,10 +578,11 @@ void PPCReduceCRLogicals::initialize(MachineFunction &MFParam) { /// a unary CR logical might be used to change the condition code on a /// comparison feeding it. A nullary CR logical might simply be removable /// if the user of the bit it [un]sets can be transformed. 
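// NOTE (on the signature change below, inferred from the "vector may
// grow" comment above): taking an index and copying the entry avoids
// holding a reference into AllCRLogicalOps across code that may append
// to it, the classic SmallVector reallocation hazard:
//   SmallVector<CRLogicalOpInfo, 16> V;
//   CRLogicalOpInfo &R = V[I];
//   V.push_back(...);   // may reallocate, leaving R dangling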
-bool PPCReduceCRLogicals::handleCROp(CRLogicalOpInfo &CRI) { +bool PPCReduceCRLogicals::handleCROp(unsigned Idx) { // We can definitely split a block on the inputs to a binary CR operation // whose defs and (single) use are within the same block. bool Changed = false; + CRLogicalOpInfo CRI = AllCRLogicalOps[Idx]; if (CRI.IsBinary && CRI.ContainedInBlock && CRI.SingleUse && CRI.FeedsBR && CRI.DefsSingleUse) { Changed = splitBlockOnBinaryCROp(CRI); diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 12554ea8d079..9ec26a19bdaa 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -325,13 +325,13 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { bool IsPositionIndependent = TM.isPositionIndependent(); if (hasBasePointer(MF)) { - if (Subtarget.isSVR4ABI() && !TM.isPPC64() && IsPositionIndependent) + if (Subtarget.is32BitELFABI() && IsPositionIndependent) markSuperRegs(Reserved, PPC::R29); else markSuperRegs(Reserved, PPC::R30); } - if (Subtarget.isSVR4ABI() && !TM.isPPC64() && IsPositionIndependent) + if (Subtarget.is32BitELFABI() && IsPositionIndependent) markSuperRegs(Reserved, PPC::R30); // Reserve Altivec registers when Altivec is unavailable. @@ -391,7 +391,7 @@ bool PPCRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) co bool PPCRegisterInfo::isCallerPreservedPhysReg(unsigned PhysReg, const MachineFunction &MF) const { - assert(TargetRegisterInfo::isPhysicalRegister(PhysReg)); + assert(Register::isPhysicalRegister(PhysReg)); const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); const MachineFrameInfo &MFI = MF.getFrameInfo(); if (!TM.isPPC64()) @@ -425,7 +425,6 @@ unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, case PPC::G8RC_NOX0RegClassID: case PPC::GPRC_NOR0RegClassID: case PPC::SPERCRegClassID: - case PPC::SPE4RCRegClassID: case PPC::G8RCRegClassID: case PPC::GPRCRegClassID: { unsigned FP = TFI->hasFP(MF) ? 1 : 0; @@ -527,7 +526,7 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { // Fortunately, a frame greater than 32K is rare. const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + Register Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) { if (LP64) @@ -549,7 +548,7 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { } bool KillNegSizeReg = MI.getOperand(1).isKill(); - unsigned NegSizeReg = MI.getOperand(1).getReg(); + Register NegSizeReg = MI.getOperand(1).getReg(); // Grow the stack and update the stack pointer link, then determine the // address of new allocated space. @@ -655,8 +654,8 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II, const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); - unsigned SrcReg = MI.getOperand(0).getReg(); + Register Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + Register SrcReg = MI.getOperand(0).getReg(); // We need to store the CR in the low 4-bits of the saved value. First, issue // an MFOCRF to save all of the CRBits and, if needed, kill the SrcReg. 
@@ -700,8 +699,8 @@ void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II, const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); - unsigned DestReg = MI.getOperand(0).getReg(); + Register Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + Register DestReg = MI.getOperand(0).getReg(); assert(MI.definesRegister(DestReg) && "RESTORE_CR does not define its destination"); @@ -744,8 +743,8 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); - unsigned SrcReg = MI.getOperand(0).getReg(); + Register Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + Register SrcReg = MI.getOperand(0).getReg(); // Search up the BB to find the definition of the CR bit. MachineBasicBlock::reverse_iterator Ins; @@ -823,8 +822,8 @@ void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II, const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); - unsigned DestReg = MI.getOperand(0).getReg(); + Register Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + Register DestReg = MI.getOperand(0).getReg(); assert(MI.definesRegister(DestReg) && "RESTORE_CRBIT does not define its destination"); @@ -833,7 +832,7 @@ void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II, BuildMI(MBB, II, dl, TII.get(TargetOpcode::IMPLICIT_DEF), DestReg); - unsigned RegO = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + Register RegO = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); BuildMI(MBB, II, dl, TII.get(LP64 ? 
PPC::MFOCRF8 : PPC::MFOCRF), RegO) .addReg(getCRFromCRBit(DestReg)); @@ -870,8 +869,8 @@ void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II, DebugLoc dl = MI.getDebugLoc(); const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC); - unsigned SrcReg = MI.getOperand(0).getReg(); + Register Reg = MF.getRegInfo().createVirtualRegister(GPRC); + Register SrcReg = MI.getOperand(0).getReg(); BuildMI(MBB, II, dl, TII.get(PPC::MFVRSAVEv), Reg) .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); @@ -896,8 +895,8 @@ void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II, DebugLoc dl = MI.getDebugLoc(); const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC); - unsigned DestReg = MI.getOperand(0).getReg(); + Register Reg = MF.getRegInfo().createVirtualRegister(GPRC); + Register DestReg = MI.getOperand(0).getReg(); assert(MI.definesRegister(DestReg) && "RESTORE_VRSAVE does not define its destination"); @@ -1128,7 +1127,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, OperandBase = OffsetOperandNo; } - unsigned StackReg = MI.getOperand(FIOperandNum).getReg(); + Register StackReg = MI.getOperand(FIOperandNum).getReg(); MI.getOperand(OperandBase).ChangeToRegister(StackReg, false); MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false, false, true); } diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td index af0dff6347a6..4719e947b172 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/lib/Target/PowerPC/PPCRegisterInfo.td @@ -253,15 +253,14 @@ def RM: PPCReg<"**ROUNDING MODE**">; /// Register classes // Allocate volatiles first // then nonvolatiles in reverse order since stmw/lmw save from rN to r31 -def GPRC : RegisterClass<"PPC", [i32], 32, (add (sequence "R%u", 2, 12), - (sequence "R%u", 30, 13), - R31, R0, R1, FP, BP)> { +def GPRC : RegisterClass<"PPC", [i32,f32], 32, (add (sequence "R%u", 2, 12), + (sequence "R%u", 30, 13), + R31, R0, R1, FP, BP)> { // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so // put it at the end of the list. let AltOrders = [(add (sub GPRC, R2), R2)]; let AltOrderSelect = [{ - const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>(); - return S.isPPC64() && S.isSVR4ABI(); + return MF.getSubtarget<PPCSubtarget>().is64BitELFABI(); }]; } @@ -272,21 +271,19 @@ def G8RC : RegisterClass<"PPC", [i64], 64, (add (sequence "X%u", 2, 12), // put it at the end of the list. let AltOrders = [(add (sub G8RC, X2), X2)]; let AltOrderSelect = [{ - const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>(); - return S.isPPC64() && S.isSVR4ABI(); + return MF.getSubtarget<PPCSubtarget>().is64BitELFABI(); }]; } // For some instructions r0 is special (representing the value 0 instead of // the value in the r0 register), and we use these register subclasses to // prevent r0 from being allocated for use by those instructions. -def GPRC_NOR0 : RegisterClass<"PPC", [i32], 32, (add (sub GPRC, R0), ZERO)> { +def GPRC_NOR0 : RegisterClass<"PPC", [i32,f32], 32, (add (sub GPRC, R0), ZERO)> { // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so // put it at the end of the list. 
let AltOrders = [(add (sub GPRC_NOR0, R2), R2)]; let AltOrderSelect = [{ - const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>(); - return S.isPPC64() && S.isSVR4ABI(); + return MF.getSubtarget<PPCSubtarget>().is64BitELFABI(); }]; } @@ -295,8 +292,7 @@ def G8RC_NOX0 : RegisterClass<"PPC", [i64], 64, (add (sub G8RC, X0), ZERO8)> { // put it at the end of the list. let AltOrders = [(add (sub G8RC_NOX0, X2), X2)]; let AltOrderSelect = [{ - const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>(); - return S.isPPC64() && S.isSVR4ABI(); + return MF.getSubtarget<PPCSubtarget>().is64BitELFABI(); }]; } @@ -304,8 +300,6 @@ def SPERC : RegisterClass<"PPC", [f64], 64, (add (sequence "S%u", 2, 12), (sequence "S%u", 30, 13), S31, S0, S1)>; -def SPE4RC : RegisterClass<"PPC", [f32], 32, (add GPRC)>; - // Allocate volatiles first, then non-volatiles in reverse order. With the SVR4 // ABI the size of the Floating-point register save area is determined by the // allocated non-volatile register with the lowest register number, as FP diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index 6aa7528634d3..10568ed4b655 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -60,7 +60,7 @@ PPCSubtarget::PPCSubtarget(const Triple &TT, const std::string &CPU, InstrInfo(*this), TLInfo(TM, *this) {} void PPCSubtarget::initializeEnvironment() { - StackAlignment = 16; + StackAlignment = Align(16); DarwinDirective = PPC::DIR_NONE; HasMFOCRF = false; Has64BitSupport = false; @@ -145,7 +145,8 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (isDarwin()) HasLazyResolverStubs = true; - if (TargetTriple.isOSNetBSD() || TargetTriple.isOSOpenBSD() || + if ((TargetTriple.isOSFreeBSD() && TargetTriple.getOSMajorVersion() >= 13) || + TargetTriple.isOSNetBSD() || TargetTriple.isOSOpenBSD() || TargetTriple.isMusl()) SecurePlt = true; @@ -228,18 +229,13 @@ bool PPCSubtarget::enableSubRegLiveness() const { return UseSubRegLiveness; } -unsigned char -PPCSubtarget::classifyGlobalReference(const GlobalValue *GV) const { - // Note that currently we don't generate non-pic references. - // If a caller wants that, this will have to be updated. - +bool PPCSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const { // Large code model always uses the TOC even for local symbols. if (TM.getCodeModel() == CodeModel::Large) - return PPCII::MO_PIC_FLAG | PPCII::MO_NLP_FLAG; - + return true; if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) - return PPCII::MO_PIC_FLAG; - return PPCII::MO_PIC_FLAG | PPCII::MO_NLP_FLAG; + return false; + return true; } bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); } diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index 55fec1cb6d99..d96c2893aee9 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -78,7 +78,7 @@ protected: /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. - unsigned StackAlignment; + Align StackAlignment; /// Selected instruction itineraries (one entry per itinerary class.) InstrItineraryData InstrItins; @@ -166,7 +166,7 @@ public: /// getStackAlignment - Returns the minimum alignment known to hold of the /// stack frame on entry to the function and which must be maintained by every /// function for this subtarget. 
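// NOTE (not part of the patch): the unsigned -> Align migration below
// uses llvm::Align from llvm/Support/Alignment.h, which encodes
// "non-zero power of two" in the type, e.g.:
//   Align A(16);   // asserts the value is a power of two
//   assert(A.value() == 16 && Log2(A) == 4);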
- unsigned getStackAlignment() const { return StackAlignment; } + Align getStackAlignment() const { return StackAlignment; } /// getDarwinDirective - Returns the -m directive specified for the cpu. /// @@ -210,7 +210,11 @@ public: /// instructions, regardless of whether we are in 32-bit or 64-bit mode. bool has64BitSupport() const { return Has64BitSupport; } // useSoftFloat - Return true if soft-float option is turned on. - bool useSoftFloat() const { return !HasHardFloat; } + bool useSoftFloat() const { + if (isAIXABI() && !HasHardFloat) + report_fatal_error("soft-float is not yet supported on AIX."); + return !HasHardFloat; + } /// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit /// registers in 32-bit mode when possible. This can only true if @@ -277,11 +281,11 @@ public: bool hasDirectMove() const { return HasDirectMove; } bool isQPXStackUnaligned() const { return IsQPXStackUnaligned; } - unsigned getPlatformStackAlignment() const { + Align getPlatformStackAlignment() const { if ((hasQPX() || isBGQ()) && !isQPXStackUnaligned()) - return 32; + return Align(32); - return 16; + return Align(16); } // DarwinABI has a 224-byte red zone. PPC32 SVR4ABI(Non-DarwinABI) has no @@ -316,6 +320,9 @@ public: bool isSVR4ABI() const { return !isDarwinABI() && !isAIXABI(); } bool isELFv2ABI() const; + bool is64BitELFABI() const { return isSVR4ABI() && isPPC64(); } + bool is32BitELFABI() const { return isSVR4ABI() && !isPPC64(); } + /// Originally, this function return hasISEL(). Now we always enable it, /// but may expand the ISEL instruction later. bool enableEarlyIfConversion() const override { return true; } @@ -337,9 +344,8 @@ public: bool enableSubRegLiveness() const override; - /// classifyGlobalReference - Classify a global variable reference for the - /// current subtarget accourding to how we should reference it. - unsigned char classifyGlobalReference(const GlobalValue *GV) const; + /// True if the GV will be accessed via an indirect symbol. + bool isGVIndirectSymbol(const GlobalValue *GV) const; bool isXRaySupported() const override { return IsPPC64 && IsLittleEndian; } }; diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp index fb826c4a32f1..8f313d9d01c4 100644 --- a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp +++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp @@ -74,8 +74,8 @@ protected: LLVM_DEBUG(dbgs() << "TLS Dynamic Call Fixup:\n " << MI); - unsigned OutReg = MI.getOperand(0).getReg(); - unsigned InReg = MI.getOperand(1).getReg(); + Register OutReg = MI.getOperand(0).getReg(); + Register InReg = MI.getOperand(1).getReg(); DebugLoc DL = MI.getDebugLoc(); unsigned GPR3 = Is64Bit ? PPC::X3 : PPC::R3; unsigned Opc1, Opc2; diff --git a/lib/Target/PowerPC/PPCTOCRegDeps.cpp b/lib/Target/PowerPC/PPCTOCRegDeps.cpp index 3eb0569fb955..895ae6744421 100644 --- a/lib/Target/PowerPC/PPCTOCRegDeps.cpp +++ b/lib/Target/PowerPC/PPCTOCRegDeps.cpp @@ -95,7 +95,8 @@ namespace { protected: bool hasTOCLoReloc(const MachineInstr &MI) { if (MI.getOpcode() == PPC::LDtocL || - MI.getOpcode() == PPC::ADDItocL) + MI.getOpcode() == PPC::ADDItocL || + MI.getOpcode() == PPC::LWZtocL) return true; for (const MachineOperand &MO : MI.operands()) { @@ -109,11 +110,15 @@ protected: bool processBlock(MachineBasicBlock &MBB) { bool Changed = false; + const bool isPPC64 = + MBB.getParent()->getSubtarget<PPCSubtarget>().isPPC64(); + const unsigned TOCReg = isPPC64 ? 
PPC::X2 : PPC::R2; + for (auto &MI : MBB) { if (!hasTOCLoReloc(MI)) continue; - MI.addOperand(MachineOperand::CreateReg(PPC::X2, + MI.addOperand(MachineOperand::CreateReg(TOCReg, false /*IsDef*/, true /*IsImp*/)); Changed = true; diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index ce00f848dd72..abefee8b339d 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -93,7 +93,7 @@ EnableMachineCombinerPass("ppc-machine-combiner", static cl::opt<bool> ReduceCRLogical("ppc-reduce-cr-logicals", cl::desc("Expand eligible cr-logical binary ops to branches"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); extern "C" void LLVMInitializePowerPCTarget() { // Register the targets RegisterTargetMachine<PPCTargetMachine> A(getThePPC32Target()); @@ -185,12 +185,13 @@ static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL, } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { - // If it isn't a Mach-O file then it's going to be a linux ELF - // object file. if (TT.isOSDarwin()) - return llvm::make_unique<TargetLoweringObjectFileMachO>(); + return std::make_unique<TargetLoweringObjectFileMachO>(); + + if (TT.isOSAIX()) + return std::make_unique<TargetLoweringObjectFileXCOFF>(); - return llvm::make_unique<PPC64LinuxTargetObjectFile>(); + return std::make_unique<PPC64LinuxTargetObjectFile>(); } static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT, @@ -248,10 +249,19 @@ static CodeModel::Model getEffectivePPCCodeModel(const Triple &TT, report_fatal_error("Target does not support the kernel CodeModel", false); return *CM; } - if (!TT.isOSDarwin() && !JIT && - (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le)) - return CodeModel::Medium; - return CodeModel::Small; + + if (JIT) + return CodeModel::Small; + if (TT.isOSAIX()) + return CodeModel::Small; + + assert(TT.isOSBinFormatELF() && "All remaining PPC OSes are ELF based."); + + if (TT.isArch32Bit()) + return CodeModel::Small; + + assert(TT.isArch64Bit() && "Unsupported PPC architecture."); + return CodeModel::Medium; } @@ -259,8 +269,8 @@ static ScheduleDAGInstrs *createPPCMachineScheduler(MachineSchedContext *C) { const PPCSubtarget &ST = C->MF->getSubtarget<PPCSubtarget>(); ScheduleDAGMILive *DAG = new ScheduleDAGMILive(C, ST.usePPCPreRASchedStrategy() ? - llvm::make_unique<PPCPreRASchedStrategy>(C) : - llvm::make_unique<GenericScheduler>(C)); + std::make_unique<PPCPreRASchedStrategy>(C) : + std::make_unique<GenericScheduler>(C)); // add DAG Mutations here. DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI)); return DAG; @@ -271,8 +281,8 @@ static ScheduleDAGInstrs *createPPCPostMachineScheduler( const PPCSubtarget &ST = C->MF->getSubtarget<PPCSubtarget>(); ScheduleDAGMI *DAG = new ScheduleDAGMI(C, ST.usePPCPostRASchedStrategy() ? - llvm::make_unique<PPCPostRASchedStrategy>(C) : - llvm::make_unique<PostGenericScheduler>(C), true); + std::make_unique<PPCPostRASchedStrategy>(C) : + std::make_unique<PostGenericScheduler>(C), true); // add DAG Mutations here. return DAG; } @@ -328,7 +338,7 @@ PPCTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. resetTargetOptions(F); - I = llvm::make_unique<PPCSubtarget>( + I = std::make_unique<PPCSubtarget>( TargetTriple, CPU, // FIXME: It would be good to have the subtarget additions here // not necessary. 
Anything that turns them on/off (overrides) ends diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index ff3dfbfaca05..f51300c656aa 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -594,10 +594,37 @@ bool PPCTTIImpl::enableInterleavedAccessVectorization() { return true; } -unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) { - if (Vector && !ST->hasAltivec() && !ST->hasQPX()) - return 0; - return ST->hasVSX() ? 64 : 32; +unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const { + assert(ClassID == GPRRC || ClassID == FPRRC || + ClassID == VRRC || ClassID == VSXRC); + if (ST->hasVSX()) { + assert(ClassID == GPRRC || ClassID == VSXRC); + return ClassID == GPRRC ? 32 : 64; + } + assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC); + return 32; +} + +unsigned PPCTTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const { + if (Vector) + return ST->hasVSX() ? VSXRC : VRRC; + else if (Ty && Ty->getScalarType()->isFloatTy()) + return ST->hasVSX() ? VSXRC : FPRRC; + else + return GPRRC; +} + +const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const { + + switch (ClassID) { + default: + llvm_unreachable("unknown register class"); + return "PPC::unknown register class"; + case GPRRC: return "PPC::GPRRC"; + case FPRRC: return "PPC::FPRRC"; + case VRRC: return "PPC::VRRC"; + case VSXRC: return "PPC::VSXRC"; + } } unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const { @@ -613,7 +640,7 @@ unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const { } -unsigned PPCTTIImpl::getCacheLineSize() { +unsigned PPCTTIImpl::getCacheLineSize() const { // Check first if the user specified a custom line size. if (CacheLineSize.getNumOccurrences() > 0) return CacheLineSize; @@ -628,7 +655,7 @@ unsigned PPCTTIImpl::getCacheLineSize() { return 64; } -unsigned PPCTTIImpl::getPrefetchDistance() { +unsigned PPCTTIImpl::getPrefetchDistance() const { // This seems like a reasonable default for the BG/Q (this pass is enabled, by // default, only on the BG/Q). return 300; @@ -752,6 +779,35 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { return 0; return Cost; + + } else if (Val->getScalarType()->isIntegerTy() && Index != -1U) { + if (ST->hasP9Altivec()) { + if (ISD == ISD::INSERT_VECTOR_ELT) + // A move-to VSR and a permute/insert. Assume vector operation cost + // for both (cost will be 2x on P9). + return vectorCostAdjustment(2, Opcode, Val, nullptr); + + // It's an extract. Maybe we can do a cheap move-from VSR. + unsigned EltSize = Val->getScalarSizeInBits(); + if (EltSize == 64) { + unsigned MfvsrdIndex = ST->isLittleEndian() ? 1 : 0; + if (Index == MfvsrdIndex) + return 1; + } else if (EltSize == 32) { + unsigned MfvsrwzIndex = ST->isLittleEndian() ? 2 : 1; + if (Index == MfvsrwzIndex) + return 1; + } + + // We need a vector extract (or mfvsrld). Assume vector operation cost. + // The cost of the load constant for a vector extract is disregarded + // (invariant, easily schedulable). + return vectorCostAdjustment(1, Opcode, Val, nullptr); + + } else if (ST->hasDirectMove()) + // Assume permute has standard cost. + // Assume move-to/move-from VSR have 2x standard cost. + return 3; } // Estimated cost of a load-hit-store delay. 
This was obtained diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h index 5d76ee418b69..83a70364bf68 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -72,10 +72,16 @@ public: TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const; bool enableInterleavedAccessVectorization(); - unsigned getNumberOfRegisters(bool Vector); + + enum PPCRegisterClass { + GPRRC, FPRRC, VRRC, VSXRC + }; + unsigned getNumberOfRegisters(unsigned ClassID) const; + unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const; + const char* getRegisterClassName(unsigned ClassID) const; unsigned getRegisterBitWidth(bool Vector) const; - unsigned getCacheLineSize(); - unsigned getPrefetchDistance(); + unsigned getCacheLineSize() const override; + unsigned getPrefetchDistance() const override; unsigned getMaxInterleaveFactor(unsigned VF); int vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1, Type *Ty2); int getArithmeticInstrCost( diff --git a/lib/Target/PowerPC/PPCVSXCopy.cpp b/lib/Target/PowerPC/PPCVSXCopy.cpp index 719ed7b63878..3463bbbdc5f0 100644 --- a/lib/Target/PowerPC/PPCVSXCopy.cpp +++ b/lib/Target/PowerPC/PPCVSXCopy.cpp @@ -50,7 +50,7 @@ namespace { bool IsRegInClass(unsigned Reg, const TargetRegisterClass *RC, MachineRegisterInfo &MRI) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { return RC->hasSubClassEq(MRI.getRegClass(Reg)); } else if (RC->contains(Reg)) { return true; @@ -102,7 +102,7 @@ protected: IsVSFReg(SrcMO.getReg(), MRI)) && "Unknown source for a VSX copy"); - unsigned NewVReg = MRI.createVirtualRegister(SrcRC); + Register NewVReg = MRI.createVirtualRegister(SrcRC); BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg) .addImm(1) // add 1, not 0, because there is no implicit clearing @@ -124,7 +124,7 @@ protected: "Unknown destination for a VSX copy"); // Copy the VSX value into a new VSX register of the correct subclass. - unsigned NewVReg = MRI.createVirtualRegister(DstRC); + Register NewVReg = MRI.createVirtualRegister(DstRC); BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), NewVReg) .add(SrcMO); diff --git a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp index ce78239df0a8..5e150be544ed 100644 --- a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp +++ b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp @@ -126,8 +126,8 @@ protected: if (!AddendMI->isFullCopy()) continue; - unsigned AddendSrcReg = AddendMI->getOperand(1).getReg(); - if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg)) { + Register AddendSrcReg = AddendMI->getOperand(1).getReg(); + if (Register::isVirtualRegister(AddendSrcReg)) { if (MRI.getRegClass(AddendMI->getOperand(0).getReg()) != MRI.getRegClass(AddendSrcReg)) continue; @@ -182,12 +182,12 @@ protected: // %5 = A-form-op %5, %5, %11; // where %5 and %11 are both kills. This case would be skipped // otherwise. - unsigned OldFMAReg = MI.getOperand(0).getReg(); + Register OldFMAReg = MI.getOperand(0).getReg(); // Find one of the product operands that is killed by this instruction. 
unsigned KilledProdOp = 0, OtherProdOp = 0; - unsigned Reg2 = MI.getOperand(2).getReg(); - unsigned Reg3 = MI.getOperand(3).getReg(); + Register Reg2 = MI.getOperand(2).getReg(); + Register Reg3 = MI.getOperand(3).getReg(); if (LIS->getInterval(Reg2).Query(FMAIdx).isKill() && Reg2 != OldFMAReg) { KilledProdOp = 2; @@ -208,14 +208,14 @@ protected: // legality checks above, the live range for the addend source register // could be extended), but it seems likely that such a trivial copy can // be coalesced away later, and thus is not worth the effort. - if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg) && + if (Register::isVirtualRegister(AddendSrcReg) && !LIS->getInterval(AddendSrcReg).liveAt(FMAIdx)) continue; // Transform: (O2 * O3) + O1 -> (O2 * O1) + O3. - unsigned KilledProdReg = MI.getOperand(KilledProdOp).getReg(); - unsigned OtherProdReg = MI.getOperand(OtherProdOp).getReg(); + Register KilledProdReg = MI.getOperand(KilledProdOp).getReg(); + Register OtherProdReg = MI.getOperand(OtherProdOp).getReg(); unsigned AddSubReg = AddendMI->getOperand(1).getSubReg(); unsigned KilledProdSubReg = MI.getOperand(KilledProdOp).getSubReg(); @@ -314,7 +314,7 @@ protected: // Extend the live interval of the addend source (it might end at the // copy to be removed, or somewhere in between there and here). This // is necessary only if it is a physical register. - if (!TargetRegisterInfo::isVirtualRegister(AddendSrcReg)) + if (!Register::isVirtualRegister(AddendSrcReg)) for (MCRegUnitIterator Units(AddendSrcReg, TRI); Units.isValid(); ++Units) { unsigned Unit = *Units; diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp index 44175af7f9b6..c3729da0b07b 100644 --- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp +++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp @@ -158,7 +158,7 @@ private: // Return true iff the given register is in the given class. bool isRegInClass(unsigned Reg, const TargetRegisterClass *RC) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) return RC->hasSubClassEq(MRI->getRegClass(Reg)); return RC->contains(Reg); } @@ -253,7 +253,7 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (isAnyVecReg(Reg, Partial)) { RelevantInstr = true; break; @@ -566,7 +566,7 @@ unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg, CopySrcReg = MI->getOperand(2).getReg(); } - if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) { + if (!Register::isVirtualRegister(CopySrcReg)) { if (!isScalarVecReg(CopySrcReg)) SwapVector[VecIdx].MentionsPhysVR = 1; return CopySrcReg; @@ -601,11 +601,11 @@ void PPCVSXSwapRemoval::formWebs() { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!isVecReg(Reg) && !isScalarVecReg(Reg)) continue; - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + if (!Register::isVirtualRegister(Reg)) { if (!(MI->isCopy() && isScalarVecReg(Reg))) SwapVector[EntryIdx].MentionsPhysVR = 1; continue; @@ -667,7 +667,7 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() { // than a swap instruction. else if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) { MachineInstr *MI = SwapVector[EntryIdx].VSEMI; - unsigned DefReg = MI->getOperand(0).getReg(); + Register DefReg = MI->getOperand(0).getReg(); // We skip debug instructions in the analysis. 
(Note that debug // location information is still maintained by this optimization @@ -695,9 +695,9 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() { // other than a swap instruction. } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) { MachineInstr *MI = SwapVector[EntryIdx].VSEMI; - unsigned UseReg = MI->getOperand(0).getReg(); + Register UseReg = MI->getOperand(0).getReg(); MachineInstr *DefMI = MRI->getVRegDef(UseReg); - unsigned DefReg = DefMI->getOperand(0).getReg(); + Register DefReg = DefMI->getOperand(0).getReg(); int DefIdx = SwapMap[DefMI]; if (!SwapVector[DefIdx].IsSwap || SwapVector[DefIdx].IsLoad || @@ -756,7 +756,7 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() { if (!SwapVector[Repr].WebRejected) { MachineInstr *MI = SwapVector[EntryIdx].VSEMI; - unsigned DefReg = MI->getOperand(0).getReg(); + Register DefReg = MI->getOperand(0).getReg(); for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) { int UseIdx = SwapMap[&UseMI]; @@ -772,7 +772,7 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() { if (!SwapVector[Repr].WebRejected) { MachineInstr *MI = SwapVector[EntryIdx].VSEMI; - unsigned UseReg = MI->getOperand(0).getReg(); + Register UseReg = MI->getOperand(0).getReg(); MachineInstr *DefMI = MRI->getVRegDef(UseReg); int DefIdx = SwapMap[DefMI]; SwapVector[DefIdx].WillRemove = 1; @@ -869,8 +869,8 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { Selector = 3 - Selector; MI->getOperand(3).setImm(Selector); - unsigned Reg1 = MI->getOperand(1).getReg(); - unsigned Reg2 = MI->getOperand(2).getReg(); + Register Reg1 = MI->getOperand(1).getReg(); + Register Reg2 = MI->getOperand(2).getReg(); MI->getOperand(1).setReg(Reg2); MI->getOperand(2).setReg(Reg1); @@ -894,9 +894,9 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { LLVM_DEBUG(dbgs() << "Changing SUBREG_TO_REG: "); LLVM_DEBUG(MI->dump()); - unsigned DstReg = MI->getOperand(0).getReg(); + Register DstReg = MI->getOperand(0).getReg(); const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); - unsigned NewVReg = MRI->createVirtualRegister(DstRC); + Register NewVReg = MRI->createVirtualRegister(DstRC); MI->getOperand(0).setReg(NewVReg); LLVM_DEBUG(dbgs() << " Into: "); @@ -910,8 +910,8 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { // prior to the swap, and from VSRC to VRRC following the swap. // Coalescing will usually remove all this mess. if (DstRC == &PPC::VRRCRegClass) { - unsigned VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass); - unsigned VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass); + Register VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass); + Register VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass); BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), TII->get(PPC::COPY), VSRCTmp1) diff --git a/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 0172c6298772..300ba8dc675c 100644 --- a/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/CodeGen/Register.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" @@ -79,7 +80,7 @@ class RISCVAsmParser : public MCTargetAsmParser { // Helper to emit a combination of LUI, ADDI(W), and SLLI instructions that // synthesize the desired immedate value into the destination register. 
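// emitLoadImm expands `li rd, imm` into real instructions via
// RISCVMatInt::generateInstSeq. For the RV32 pattern (LUI + ADDI) the split
// works as in the standalone sketch below; RV64 values that additionally need
// SLLI/ADDIW chains are not modeled here. The +0x800 rounding compensates for
// ADDI sign-extending its 12-bit immediate:

#include <cassert>
#include <cstdint>

// Split Val so that `lui rd, Hi20` followed by `addi rd, rd, Lo12`
// reproduces it.
void splitHiLo(int32_t Val, uint32_t &Hi20, int32_t &Lo12) {
  Lo12 = Val & 0xFFF;
  if (Lo12 >= 0x800)
    Lo12 -= 0x1000; // sign-extend the low 12 bits
  Hi20 = (static_cast<uint32_t>(Val) + 0x800) >> 12; // LUI immediate
}

int main() {
  for (int32_t Val : {0, 1, -1, 2047, 2048, -2048, 70000, -70000}) {
    uint32_t Hi;
    int32_t Lo;
    splitHiLo(Val, Hi, Lo);
    // LUI writes Hi << 12; ADDI then adds the signed low part.
    int32_t Rebuilt = static_cast<int32_t>(Hi << 12) + Lo;
    assert(Rebuilt == Val && "LUI+ADDI must reassemble the constant");
  }
}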
- void emitLoadImm(unsigned DestReg, int64_t Value, MCStreamer &Out); + void emitLoadImm(Register DestReg, int64_t Value, MCStreamer &Out); // Helper to emit a combination of AUIPC and SecondOpcode. Used to implement // helpers such as emitLoadLocalAddress and emitLoadAddress. @@ -127,6 +128,7 @@ class RISCVAsmParser : public MCTargetAsmParser { OperandMatchResultTy parseRegister(OperandVector &Operands, bool AllowParens = false); OperandMatchResultTy parseMemOpBaseReg(OperandVector &Operands); + OperandMatchResultTy parseAtomicMemOp(OperandVector &Operands); OperandMatchResultTy parseOperandWithModifier(OperandVector &Operands); OperandMatchResultTy parseBareSymbol(OperandVector &Operands); OperandMatchResultTy parseCallSymbol(OperandVector &Operands); @@ -193,7 +195,7 @@ public: /// instruction struct RISCVOperand : public MCParsedAsmOperand { - enum KindTy { + enum class KindTy { Token, Register, Immediate, @@ -203,7 +205,7 @@ struct RISCVOperand : public MCParsedAsmOperand { bool IsRV64; struct RegOp { - unsigned RegNum; + Register RegNum; }; struct ImmOp { @@ -235,26 +237,26 @@ public: StartLoc = o.StartLoc; EndLoc = o.EndLoc; switch (Kind) { - case Register: + case KindTy::Register: Reg = o.Reg; break; - case Immediate: + case KindTy::Immediate: Imm = o.Imm; break; - case Token: + case KindTy::Token: Tok = o.Tok; break; - case SystemRegister: + case KindTy::SystemRegister: SysReg = o.SysReg; break; } } - bool isToken() const override { return Kind == Token; } - bool isReg() const override { return Kind == Register; } - bool isImm() const override { return Kind == Immediate; } + bool isToken() const override { return Kind == KindTy::Token; } + bool isReg() const override { return Kind == KindTy::Register; } + bool isImm() const override { return Kind == KindTy::Immediate; } bool isMem() const override { return false; } - bool isSystemRegister() const { return Kind == SystemRegister; } + bool isSystemRegister() const { return Kind == KindTy::SystemRegister; } static bool evaluateConstantImm(const MCExpr *Expr, int64_t &Imm, RISCVMCExpr::VariantKind &VK) { @@ -276,7 +278,7 @@ public: // modifiers and isShiftedInt<N-1, 1>(Op). template <int N> bool isBareSimmNLsb0() const { int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; if (!isImm()) return false; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); @@ -292,7 +294,7 @@ public: bool isBareSymbol() const { int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; // Must be of 'immediate' type but not a constant. if (!isImm() || evaluateConstantImm(getImm(), Imm, VK)) return false; @@ -302,7 +304,7 @@ public: bool isCallSymbol() const { int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; // Must be of 'immediate' type but not a constant. if (!isImm() || evaluateConstantImm(getImm(), Imm, VK)) return false; @@ -313,7 +315,7 @@ public: bool isTPRelAddSymbol() const { int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; // Must be of 'immediate' type but not a constant. 
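// Two recurring edits run through the operand checks in this file: every
// `RISCVMCExpr::VariantKind VK` is now defensively default-initialized to
// VK_RISCV_None (evaluateConstantImm leaves the out-parameter untouched on
// its failure path), and the isSImmN/isUImmN predicates all reduce to
// bit-width tests. A standalone sketch of those tests, modeled on the
// isInt / isUInt / isShiftedInt helpers in llvm/Support/MathExtras.h:

#include <cassert>
#include <cstdint>

template <unsigned N> bool isIntN(int64_t X) { // fits in N-bit signed
  static_assert(N > 0 && N < 64, "sketch covers 1..63 bits");
  return X >= -(INT64_C(1) << (N - 1)) && X < (INT64_C(1) << (N - 1));
}
template <unsigned N> bool isUIntN(uint64_t X) { // fits in N-bit unsigned
  static_assert(N > 0 && N < 64, "sketch covers 1..63 bits");
  return X < (UINT64_C(1) << N);
}
template <unsigned N, unsigned S> bool isShiftedIntN(int64_t X) {
  // An N-bit signed value scaled by 1<<S: in range, low S bits clear.
  return isIntN<N + S>(X) && (X & ((INT64_C(1) << S) - 1)) == 0;
}

int main() {
  assert(isIntN<6>(-32) && !isIntN<6>(32));  // isSImm6 bounds
  assert(isUIntN<5>(31) && !isUIntN<5>(32)); // isUImm5 bounds
  assert(isShiftedIntN<12, 1>(4094)); // isBareSimmNLsb0<13>: branch offset
  assert(!isShiftedIntN<12, 1>(3));   // odd offsets rejected
}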
if (!isImm() || evaluateConstantImm(getImm(), Imm, VK)) return false; @@ -364,7 +366,7 @@ public: bool isImmXLenLI() const { int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; if (!isImm()) return false; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); @@ -372,13 +374,13 @@ public: return true; // Given only Imm, ensuring that the actually specified constant is either // a signed or unsigned 64-bit number is unfortunately impossible. - bool IsInRange = isRV64() ? true : isInt<32>(Imm) || isUInt<32>(Imm); - return IsConstantImm && IsInRange && VK == RISCVMCExpr::VK_RISCV_None; + return IsConstantImm && VK == RISCVMCExpr::VK_RISCV_None && + (isRV64() || (isInt<32>(Imm) || isUInt<32>(Imm))); } bool isUImmLog2XLen() const { int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; if (!isImm()) return false; if (!evaluateConstantImm(getImm(), Imm, VK) || @@ -389,7 +391,7 @@ public: bool isUImmLog2XLenNonZero() const { int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; if (!isImm()) return false; if (!evaluateConstantImm(getImm(), Imm, VK) || @@ -402,7 +404,7 @@ public: bool isUImm5() const { int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; if (!isImm()) return false; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); @@ -411,7 +413,7 @@ public: bool isUImm5NonZero() const { int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; if (!isImm()) return false; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); @@ -422,7 +424,7 @@ public: bool isSImm6() const { if (!isImm()) return false; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; int64_t Imm; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); return IsConstantImm && isInt<6>(Imm) && @@ -432,7 +434,7 @@ public: bool isSImm6NonZero() const { if (!isImm()) return false; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; int64_t Imm; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); return IsConstantImm && isInt<6>(Imm) && (Imm != 0) && @@ -443,7 +445,7 @@ public: if (!isImm()) return false; int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); return IsConstantImm && (Imm != 0) && (isUInt<5>(Imm) || (Imm >= 0xfffe0 && Imm <= 0xfffff)) && @@ -454,7 +456,7 @@ public: if (!isImm()) return false; int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); return IsConstantImm && isShiftedUInt<5, 2>(Imm) && VK == RISCVMCExpr::VK_RISCV_None; @@ -464,7 +466,7 @@ public: if (!isImm()) return false; int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); return IsConstantImm && isShiftedUInt<6, 2>(Imm) && VK == RISCVMCExpr::VK_RISCV_None; @@ -474,7 +476,7 @@ public: if (!isImm()) return false; int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); return IsConstantImm && isShiftedUInt<5, 3>(Imm) && VK == RISCVMCExpr::VK_RISCV_None; @@ -486,7 
+488,7 @@ public: if (!isImm()) return false; int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); return IsConstantImm && isShiftedUInt<6, 3>(Imm) && VK == RISCVMCExpr::VK_RISCV_None; @@ -496,14 +498,14 @@ public: if (!isImm()) return false; int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); return IsConstantImm && isShiftedUInt<8, 2>(Imm) && (Imm != 0) && VK == RISCVMCExpr::VK_RISCV_None; } bool isSImm12() const { - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; int64_t Imm; bool IsValid; if (!isImm()) @@ -527,14 +529,14 @@ public: if (!isImm()) return false; int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); return IsConstantImm && (Imm != 0) && isShiftedInt<6, 4>(Imm) && VK == RISCVMCExpr::VK_RISCV_None; } bool isUImm20LUI() const { - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; int64_t Imm; bool IsValid; if (!isImm()) @@ -552,7 +554,7 @@ public: } bool isUImm20AUIPC() const { - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; int64_t Imm; bool IsValid; if (!isImm()) @@ -575,6 +577,15 @@ public: bool isSImm21Lsb0JAL() const { return isBareSimmNLsb0<21>(); } + bool isImmZero() const { + if (!isImm()) + return false; + int64_t Imm; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; + bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); + return IsConstantImm && (Imm == 0) && VK == RISCVMCExpr::VK_RISCV_None; + } + /// getStartLoc - Gets location of the first token of this operand SMLoc getStartLoc() const override { return StartLoc; } /// getEndLoc - Gets location of the last token of this operand @@ -583,38 +594,38 @@ public: bool isRV64() const { return IsRV64; } unsigned getReg() const override { - assert(Kind == Register && "Invalid type access!"); - return Reg.RegNum; + assert(Kind == KindTy::Register && "Invalid type access!"); + return Reg.RegNum.id(); } StringRef getSysReg() const { - assert(Kind == SystemRegister && "Invalid access!"); + assert(Kind == KindTy::SystemRegister && "Invalid access!"); return StringRef(SysReg.Data, SysReg.Length); } const MCExpr *getImm() const { - assert(Kind == Immediate && "Invalid type access!"); + assert(Kind == KindTy::Immediate && "Invalid type access!"); return Imm.Val; } StringRef getToken() const { - assert(Kind == Token && "Invalid type access!"); + assert(Kind == KindTy::Token && "Invalid type access!"); return Tok; } void print(raw_ostream &OS) const override { switch (Kind) { - case Immediate: + case KindTy::Immediate: OS << *getImm(); break; - case Register: + case KindTy::Register: OS << "<register x"; OS << getReg() << ">"; break; - case Token: + case KindTy::Token: OS << "'" << getToken() << "'"; break; - case SystemRegister: + case KindTy::SystemRegister: OS << "<sysreg: " << getSysReg() << '>'; break; } @@ -622,7 +633,7 @@ public: static std::unique_ptr<RISCVOperand> createToken(StringRef Str, SMLoc S, bool IsRV64) { - auto Op = make_unique<RISCVOperand>(Token); + auto Op = std::make_unique<RISCVOperand>(KindTy::Token); Op->Tok = Str; Op->StartLoc = S; Op->EndLoc = S; @@ -632,7 +643,7 @@ public: static std::unique_ptr<RISCVOperand> createReg(unsigned RegNo, 
SMLoc S, SMLoc E, bool IsRV64) { - auto Op = make_unique<RISCVOperand>(Register); + auto Op = std::make_unique<RISCVOperand>(KindTy::Register); Op->Reg.RegNum = RegNo; Op->StartLoc = S; Op->EndLoc = E; @@ -642,7 +653,7 @@ public: static std::unique_ptr<RISCVOperand> createImm(const MCExpr *Val, SMLoc S, SMLoc E, bool IsRV64) { - auto Op = make_unique<RISCVOperand>(Immediate); + auto Op = std::make_unique<RISCVOperand>(KindTy::Immediate); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -652,7 +663,7 @@ public: static std::unique_ptr<RISCVOperand> createSysReg(StringRef Str, SMLoc S, unsigned Encoding, bool IsRV64) { - auto Op = make_unique<RISCVOperand>(SystemRegister); + auto Op = std::make_unique<RISCVOperand>(KindTy::SystemRegister); Op->SysReg.Data = Str.data(); Op->SysReg.Length = Str.size(); Op->SysReg.Encoding = Encoding; @@ -664,7 +675,7 @@ public: void addExpr(MCInst &Inst, const MCExpr *Expr) const { assert(Expr && "Expr shouldn't be null!"); int64_t Imm = 0; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; bool IsConstant = evaluateConstantImm(Expr, Imm, VK); if (IsConstant) @@ -730,46 +741,9 @@ public: #define GET_MATCHER_IMPLEMENTATION #include "RISCVGenAsmMatcher.inc" -// Return the matching FPR64 register for the given FPR32. -// FIXME: Ideally this function could be removed in favour of using -// information from TableGen. -unsigned convertFPR32ToFPR64(unsigned Reg) { - switch (Reg) { - default: - llvm_unreachable("Not a recognised FPR32 register"); - case RISCV::F0_32: return RISCV::F0_64; - case RISCV::F1_32: return RISCV::F1_64; - case RISCV::F2_32: return RISCV::F2_64; - case RISCV::F3_32: return RISCV::F3_64; - case RISCV::F4_32: return RISCV::F4_64; - case RISCV::F5_32: return RISCV::F5_64; - case RISCV::F6_32: return RISCV::F6_64; - case RISCV::F7_32: return RISCV::F7_64; - case RISCV::F8_32: return RISCV::F8_64; - case RISCV::F9_32: return RISCV::F9_64; - case RISCV::F10_32: return RISCV::F10_64; - case RISCV::F11_32: return RISCV::F11_64; - case RISCV::F12_32: return RISCV::F12_64; - case RISCV::F13_32: return RISCV::F13_64; - case RISCV::F14_32: return RISCV::F14_64; - case RISCV::F15_32: return RISCV::F15_64; - case RISCV::F16_32: return RISCV::F16_64; - case RISCV::F17_32: return RISCV::F17_64; - case RISCV::F18_32: return RISCV::F18_64; - case RISCV::F19_32: return RISCV::F19_64; - case RISCV::F20_32: return RISCV::F20_64; - case RISCV::F21_32: return RISCV::F21_64; - case RISCV::F22_32: return RISCV::F22_64; - case RISCV::F23_32: return RISCV::F23_64; - case RISCV::F24_32: return RISCV::F24_64; - case RISCV::F25_32: return RISCV::F25_64; - case RISCV::F26_32: return RISCV::F26_64; - case RISCV::F27_32: return RISCV::F27_64; - case RISCV::F28_32: return RISCV::F28_64; - case RISCV::F29_32: return RISCV::F29_64; - case RISCV::F30_32: return RISCV::F30_64; - case RISCV::F31_32: return RISCV::F31_64; - } +static Register convertFPR64ToFPR32(Register Reg) { + assert(Reg >= RISCV::F0_D && Reg <= RISCV::F31_D && "Invalid register"); + return Reg - RISCV::F0_D + RISCV::F0_F; } unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, @@ -778,17 +752,17 @@ unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, if (!Op.isReg()) return Match_InvalidOperand; - unsigned Reg = Op.getReg(); - bool IsRegFPR32 = - RISCVMCRegisterClasses[RISCV::FPR32RegClassID].contains(Reg); - bool IsRegFPR32C = - RISCVMCRegisterClasses[RISCV::FPR32CRegClassID].contains(Reg); + Register Reg = 
Op.getReg(); + bool IsRegFPR64 = + RISCVMCRegisterClasses[RISCV::FPR64RegClassID].contains(Reg); + bool IsRegFPR64C = + RISCVMCRegisterClasses[RISCV::FPR64CRegClassID].contains(Reg); // As the parser couldn't differentiate an FPR32 from an FPR64, coerce the - // register from FPR32 to FPR64 or FPR32C to FPR64C if necessary. - if ((IsRegFPR32 && Kind == MCK_FPR64) || - (IsRegFPR32C && Kind == MCK_FPR64C)) { - Op.Reg.RegNum = convertFPR32ToFPR64(Reg); + // register from FPR64 to FPR32 or FPR64C to FPR32C if necessary. + if ((IsRegFPR64 && Kind == MCK_FPR32) || + (IsRegFPR64C && Kind == MCK_FPR32C)) { + Op.Reg.RegNum = convertFPR64ToFPR32(Reg); return Match_Success; } return Match_InvalidOperand; @@ -853,6 +827,10 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return generateImmOutOfRangeError(Operands, ErrorInfo, std::numeric_limits<int32_t>::min(), std::numeric_limits<uint32_t>::max()); + case Match_InvalidImmZero: { + SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); + return Error(ErrorLoc, "immediate must be zero"); + } case Match_InvalidUImmLog2XLen: if (isRV64()) return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 6) - 1); @@ -968,14 +946,19 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // alternative ABI names), setting RegNo to the matching register. Upon // failure, returns true and sets RegNo to 0. If IsRV32E then registers // x16-x31 will be rejected. -static bool matchRegisterNameHelper(bool IsRV32E, unsigned &RegNo, +static bool matchRegisterNameHelper(bool IsRV32E, Register &RegNo, StringRef Name) { RegNo = MatchRegisterName(Name); - if (RegNo == 0) + // The 32- and 64-bit FPRs have the same asm name. Check that the initial + // match always matches the 64-bit variant, and not the 32-bit one. + assert(!(RegNo >= RISCV::F0_F && RegNo <= RISCV::F31_F)); + // The default FPR register class is based on the tablegen enum ordering. + static_assert(RISCV::F0_D < RISCV::F0_F, "FPR matching must be updated"); + if (RegNo == RISCV::NoRegister) RegNo = MatchRegisterAltName(Name); if (IsRV32E && RegNo >= RISCV::X16 && RegNo <= RISCV::X31) - RegNo = 0; - return RegNo == 0; + RegNo = RISCV::NoRegister; + return RegNo == RISCV::NoRegister; } bool RISCVAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, @@ -986,7 +969,7 @@ bool RISCVAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, RegNo = 0; StringRef Name = getLexer().getTok().getIdentifier(); - if (matchRegisterNameHelper(isRV32E(), RegNo, Name)) + if (matchRegisterNameHelper(isRV32E(), (Register&)RegNo, Name)) return Error(StartLoc, "invalid register name"); getParser().Lex(); // Eat identifier token. 
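// convertFPR64ToFPR32 above works because tablegen emits each FPR bank as a
// contiguous run of enum values in the same register order, so switching
// banks is plain offset arithmetic; the static_assert in
// matchRegisterNameHelper pins the enum ordering that makes the shared asm
// names ("f0".."f31") resolve to the 64-bit bank first. A standalone model
// with mock register numbers (not LLVM's actual values):

#include <cassert>

enum MockReg : unsigned {
  NoRegister = 0,
  F0_D, F1_D, F2_D, /* ... */ F31_D = F0_D + 31,
  F0_F, F1_F, F2_F, /* ... */ F31_F = F0_F + 31,
};

static_assert(F0_D < F0_F, "FPR matching must be updated");

unsigned convertFPR64ToFPR32(unsigned Reg) {
  assert(Reg >= F0_D && Reg <= F31_D && "Invalid register");
  return Reg - F0_D + F0_F; // same index, 32-bit bank
}

int main() {
  assert(convertFPR64ToFPR32(F1_D) == F1_F);
  assert(convertFPR64ToFPR32(F31_D) == F31_F);
}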
@@ -1018,10 +1001,10 @@ OperandMatchResultTy RISCVAsmParser::parseRegister(OperandVector &Operands, return MatchOperand_NoMatch; case AsmToken::Identifier: StringRef Name = getLexer().getTok().getIdentifier(); - unsigned RegNo; + Register RegNo; matchRegisterNameHelper(isRV32E(), RegNo, Name); - if (RegNo == 0) { + if (RegNo == RISCV::NoRegister) { if (HadParens) getLexer().UnLex(LParen); return MatchOperand_NoMatch; @@ -1208,6 +1191,24 @@ OperandMatchResultTy RISCVAsmParser::parseBareSymbol(OperandVector &Operands) { Res = V; } else Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); + + MCBinaryExpr::Opcode Opcode; + switch (getLexer().getKind()) { + default: + Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64())); + return MatchOperand_Success; + case AsmToken::Plus: + Opcode = MCBinaryExpr::Add; + break; + case AsmToken::Minus: + Opcode = MCBinaryExpr::Sub; + break; + } + + const MCExpr *Expr; + if (getParser().parseExpression(Expr)) + return MatchOperand_ParseFail; + Res = MCBinaryExpr::create(Opcode, Res, Expr, getContext()); Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64())); return MatchOperand_Success; } @@ -1282,6 +1283,73 @@ RISCVAsmParser::parseMemOpBaseReg(OperandVector &Operands) { return MatchOperand_Success; } +OperandMatchResultTy RISCVAsmParser::parseAtomicMemOp(OperandVector &Operands) { + // Atomic operations such as lr.w, sc.w, and amo*.w accept a "memory operand" + // as one of their register operands, such as `(a0)`. This just denotes that + // the register (in this case `a0`) contains a memory address. + // + // Normally, we would be able to parse these by putting the parens into the + // instruction string. However, GNU as also accepts a zero-offset memory + // operand (such as `0(a0)`), and ignores the 0. Normally this would be parsed + // with parseImmediate followed by parseMemOpBaseReg, but these instructions + // do not accept an immediate operand, and we do not want to add a "dummy" + // operand that is silently dropped. + // + // Instead, we use this custom parser. This will: allow (and discard) an + // offset if it is zero; require (and discard) parentheses; and add only the + // parsed register operand to `Operands`. + // + // These operands are printed with RISCVInstPrinter::printAtomicMemOp, which + // will only print the register surrounded by parentheses (which GNU as also + // uses as its canonical representation for these operands). + std::unique_ptr<RISCVOperand> OptionalImmOp; + + if (getLexer().isNot(AsmToken::LParen)) { + // Parse an Integer token. We do not accept arbritrary constant expressions + // in the offset field (because they may include parens, which complicates + // parsing a lot). + int64_t ImmVal; + SMLoc ImmStart = getLoc(); + if (getParser().parseIntToken(ImmVal, + "expected '(' or optional integer offset")) + return MatchOperand_ParseFail; + + // Create a RISCVOperand for checking later (so the error messages are + // nicer), but we don't add it to Operands. + SMLoc ImmEnd = getLoc(); + OptionalImmOp = + RISCVOperand::createImm(MCConstantExpr::create(ImmVal, getContext()), + ImmStart, ImmEnd, isRV64()); + } + + if (getLexer().isNot(AsmToken::LParen)) { + Error(getLoc(), OptionalImmOp ? 
"expected '(' after optional integer offset" + : "expected '(' or optional integer offset"); + return MatchOperand_ParseFail; + } + getParser().Lex(); // Eat '(' + + if (parseRegister(Operands) != MatchOperand_Success) { + Error(getLoc(), "expected register"); + return MatchOperand_ParseFail; + } + + if (getLexer().isNot(AsmToken::RParen)) { + Error(getLoc(), "expected ')'"); + return MatchOperand_ParseFail; + } + getParser().Lex(); // Eat ')' + + // Deferred Handling of non-zero offsets. This makes the error messages nicer. + if (OptionalImmOp && !OptionalImmOp->isImmZero()) { + Error(OptionalImmOp->getStartLoc(), "optional integer offset must be 0", + SMRange(OptionalImmOp->getStartLoc(), OptionalImmOp->getEndLoc())); + return MatchOperand_ParseFail; + } + + return MatchOperand_Success; +} + /// Looks at a token type and creates the relevant operand from this /// information, adding to Operands. If operand was parsed, returns false, else /// true. @@ -1523,12 +1591,12 @@ void RISCVAsmParser::emitToStreamer(MCStreamer &S, const MCInst &Inst) { S.EmitInstruction((Res ? CInst : Inst), getSTI()); } -void RISCVAsmParser::emitLoadImm(unsigned DestReg, int64_t Value, +void RISCVAsmParser::emitLoadImm(Register DestReg, int64_t Value, MCStreamer &Out) { RISCVMatInt::InstSeq Seq; RISCVMatInt::generateInstSeq(Value, isRV64(), Seq); - unsigned SrcReg = RISCV::X0; + Register SrcReg = RISCV::X0; for (RISCVMatInt::Inst &Inst : Seq) { if (Inst.Opc == RISCV::LUI) { emitToStreamer( @@ -1682,7 +1750,7 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, default: break; case RISCV::PseudoLI: { - unsigned Reg = Inst.getOperand(0).getReg(); + Register Reg = Inst.getOperand(0).getReg(); const MCOperand &Op1 = Inst.getOperand(1); if (Op1.isExpr()) { // We must have li reg, %lo(sym) or li reg, %pcrel_lo(sym) or similar. diff --git a/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 36200c03f703..15943ba42156 100644 --- a/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -13,6 +13,7 @@ #include "MCTargetDesc/RISCVMCTargetDesc.h" #include "TargetInfo/RISCVTargetInfo.h" #include "Utils/RISCVBaseInfo.h" +#include "llvm/CodeGen/Register.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCFixedLenDisassembler.h" @@ -56,17 +57,6 @@ extern "C" void LLVMInitializeRISCVDisassembler() { createRISCVDisassembler); } -static const unsigned GPRDecoderTable[] = { - RISCV::X0, RISCV::X1, RISCV::X2, RISCV::X3, - RISCV::X4, RISCV::X5, RISCV::X6, RISCV::X7, - RISCV::X8, RISCV::X9, RISCV::X10, RISCV::X11, - RISCV::X12, RISCV::X13, RISCV::X14, RISCV::X15, - RISCV::X16, RISCV::X17, RISCV::X18, RISCV::X19, - RISCV::X20, RISCV::X21, RISCV::X22, RISCV::X23, - RISCV::X24, RISCV::X25, RISCV::X26, RISCV::X27, - RISCV::X28, RISCV::X29, RISCV::X30, RISCV::X31 -}; - static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { @@ -76,38 +66,21 @@ static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, uint64_t RegNo, .getFeatureBits(); bool IsRV32E = FeatureBits[RISCV::FeatureRV32E]; - if (RegNo > array_lengthof(GPRDecoderTable) || (IsRV32E && RegNo > 15)) + if (RegNo >= 32 || (IsRV32E && RegNo >= 16)) return MCDisassembler::Fail; - // We must define our own mapping from RegNo to register identifier. 
- // Accessing index RegNo in the register class will work in the case that - // registers were added in ascending order, but not in general. - unsigned Reg = GPRDecoderTable[RegNo]; + Register Reg = RISCV::X0 + RegNo; Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } -static const unsigned FPR32DecoderTable[] = { - RISCV::F0_32, RISCV::F1_32, RISCV::F2_32, RISCV::F3_32, - RISCV::F4_32, RISCV::F5_32, RISCV::F6_32, RISCV::F7_32, - RISCV::F8_32, RISCV::F9_32, RISCV::F10_32, RISCV::F11_32, - RISCV::F12_32, RISCV::F13_32, RISCV::F14_32, RISCV::F15_32, - RISCV::F16_32, RISCV::F17_32, RISCV::F18_32, RISCV::F19_32, - RISCV::F20_32, RISCV::F21_32, RISCV::F22_32, RISCV::F23_32, - RISCV::F24_32, RISCV::F25_32, RISCV::F26_32, RISCV::F27_32, - RISCV::F28_32, RISCV::F29_32, RISCV::F30_32, RISCV::F31_32 -}; - static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { - if (RegNo > array_lengthof(FPR32DecoderTable)) + if (RegNo >= 32) return MCDisassembler::Fail; - // We must define our own mapping from RegNo to register identifier. - // Accessing index RegNo in the register class will work in the case that - // registers were added in ascending order, but not in general. - unsigned Reg = FPR32DecoderTable[RegNo]; + Register Reg = RISCV::F0_F + RegNo; Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -115,35 +88,21 @@ static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, uint64_t RegNo, static DecodeStatus DecodeFPR32CRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { - if (RegNo > 8) { + if (RegNo >= 8) { return MCDisassembler::Fail; } - unsigned Reg = FPR32DecoderTable[RegNo + 8]; + Register Reg = RISCV::F8_F + RegNo; Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } -static const unsigned FPR64DecoderTable[] = { - RISCV::F0_64, RISCV::F1_64, RISCV::F2_64, RISCV::F3_64, - RISCV::F4_64, RISCV::F5_64, RISCV::F6_64, RISCV::F7_64, - RISCV::F8_64, RISCV::F9_64, RISCV::F10_64, RISCV::F11_64, - RISCV::F12_64, RISCV::F13_64, RISCV::F14_64, RISCV::F15_64, - RISCV::F16_64, RISCV::F17_64, RISCV::F18_64, RISCV::F19_64, - RISCV::F20_64, RISCV::F21_64, RISCV::F22_64, RISCV::F23_64, - RISCV::F24_64, RISCV::F25_64, RISCV::F26_64, RISCV::F27_64, - RISCV::F28_64, RISCV::F29_64, RISCV::F30_64, RISCV::F31_64 -}; - static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { - if (RegNo > array_lengthof(FPR64DecoderTable)) + if (RegNo >= 32) return MCDisassembler::Fail; - // We must define our own mapping from RegNo to register identifier. - // Accessing index RegNo in the register class will work in the case that - // registers were added in ascending order, but not in general. 
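// Two changes run through the decoders above: the hand-written
// XXXDecoderTable arrays are gone (tablegen now guarantees each register
// class is a contiguous ascending run, so `RISCV::X0 + RegNo` suffices),
// and the range checks tightened from `RegNo > N` to `RegNo >= N` -- the
// old form let the one-past-the-end encoding reach the table. A standalone
// sketch of the fixed shape:

#include <cstdint>
#include <optional>

enum MockReg : unsigned { X0 = 1, X31 = X0 + 31 };

std::optional<unsigned> decodeGPR(uint64_t RegNo) {
  if (RegNo >= 32)       // previously `RegNo > 32`, so 32 slipped through
    return std::nullopt; // MCDisassembler::Fail
  return X0 + RegNo;     // contiguous enum: arithmetic replaces the table
}

int main() {
  return (decodeGPR(31) && *decodeGPR(31) == X31 && !decodeGPR(32)) ? 0 : 1;
}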
- unsigned Reg = FPR64DecoderTable[RegNo]; + Register Reg = RISCV::F0_D + RegNo; Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -151,10 +110,10 @@ static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, uint64_t RegNo, static DecodeStatus DecodeFPR64CRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { - if (RegNo > 8) { + if (RegNo >= 8) { return MCDisassembler::Fail; } - unsigned Reg = FPR64DecoderTable[RegNo + 8]; + Register Reg = RISCV::F8_D + RegNo; Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -182,10 +141,10 @@ static DecodeStatus DecodeGPRNoX0X2RegisterClass(MCInst &Inst, uint64_t RegNo, static DecodeStatus DecodeGPRCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { - if (RegNo > 8) + if (RegNo >= 8) return MCDisassembler::Fail; - unsigned Reg = GPRDecoderTable[RegNo + 8]; + Register Reg = RISCV::X8 + RegNo; Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -279,8 +238,80 @@ static DecodeStatus decodeFRMArg(MCInst &Inst, uint64_t Imm, return MCDisassembler::Success; } +static DecodeStatus decodeRVCInstrSImm(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); + +static DecodeStatus decodeRVCInstrRdSImm(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); + +static DecodeStatus decodeRVCInstrRdRs1UImm(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); + +static DecodeStatus decodeRVCInstrRdRs1Rs2(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); + #include "RISCVGenDisassemblerTables.inc" +static DecodeStatus decodeRVCInstrSImm(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + uint64_t SImm6 = + fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5); + DecodeStatus Result = decodeSImmOperand<6>(Inst, SImm6, Address, Decoder); + (void)Result; + assert(Result == MCDisassembler::Success && "Invalid immediate"); + return MCDisassembler::Success; +} + +static DecodeStatus decodeRVCInstrRdSImm(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder) { + DecodeGPRRegisterClass(Inst, 0, Address, Decoder); + uint64_t SImm6 = + fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5); + DecodeStatus Result = decodeSImmOperand<6>(Inst, SImm6, Address, Decoder); + (void)Result; + assert(Result == MCDisassembler::Success && "Invalid immediate"); + return MCDisassembler::Success; +} + +static DecodeStatus decodeRVCInstrRdRs1UImm(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder) { + DecodeGPRRegisterClass(Inst, 0, Address, Decoder); + Inst.addOperand(Inst.getOperand(0)); + uint64_t UImm6 = + fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5); + DecodeStatus Result = decodeUImmOperand<6>(Inst, UImm6, Address, Decoder); + (void)Result; + assert(Result == MCDisassembler::Success && "Invalid immediate"); + return MCDisassembler::Success; +} + +static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + unsigned Rd = fieldFromInstruction(Insn, 7, 5); + unsigned Rs2 = fieldFromInstruction(Insn, 2, 5); + DecodeGPRRegisterClass(Inst, Rd, Address, Decoder); + DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder); + return MCDisassembler::Success; +} + +static DecodeStatus 
decodeRVCInstrRdRs1Rs2(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(Insn, 7, 5); + unsigned Rs2 = fieldFromInstruction(Insn, 2, 5); + DecodeGPRRegisterClass(Inst, Rd, Address, Decoder); + Inst.addOperand(Inst.getOperand(0)); + DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder); + return MCDisassembler::Success; +} + DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index ee5f760ebcb0..f6b727ae37c7 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -30,9 +30,16 @@ bool RISCVAsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCValue &Target) { bool ShouldForce = false; - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: break; + case FK_Data_1: + case FK_Data_2: + case FK_Data_4: + case FK_Data_8: + if (Target.isAbsolute()) + return false; + break; case RISCV::fixup_riscv_got_hi20: case RISCV::fixup_riscv_tls_got_hi20: case RISCV::fixup_riscv_tls_gd_hi20: @@ -48,7 +55,7 @@ bool RISCVAsmBackend::shouldForceRelocation(const MCAssembler &Asm, return false; } - switch ((unsigned)T->getKind()) { + switch (T->getTargetKind()) { default: llvm_unreachable("Unexpected fixup kind for pcrel_lo12"); break; @@ -83,7 +90,7 @@ bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, return true; int64_t Offset = int64_t(Value); - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: return false; case RISCV::fixup_riscv_rvc_branch: @@ -174,8 +181,7 @@ bool RISCVAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const { static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, MCContext &Ctx) { - unsigned Kind = Fixup.getKind(); - switch (Kind) { + switch (Fixup.getTargetKind()) { default: llvm_unreachable("Unknown fixup kind!"); case RISCV::fixup_riscv_got_hi20: @@ -186,6 +192,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, case FK_Data_2: case FK_Data_4: case FK_Data_8: + case FK_Data_6b: return Value; case RISCV::fixup_riscv_lo12_i: case RISCV::fixup_riscv_pcrel_lo12_i: diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp index 3ccbc86d2619..cab2bbcb81bc 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/RISCVFixupKinds.h" +#include "MCTargetDesc/RISCVMCExpr.h" #include "MCTargetDesc/RISCVMCTargetDesc.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixup.h" @@ -47,8 +48,9 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { + const MCExpr *Expr = Fixup.getValue(); // Determine the type of the relocation - unsigned Kind = Fixup.getKind(); + unsigned Kind = Fixup.getTargetKind(); if (IsPCRel) { switch (Kind) { default: @@ -87,6 +89,9 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, default: llvm_unreachable("invalid fixup kind!"); case FK_Data_4: + if (Expr->getKind() == MCExpr::Target && + cast<RISCVMCExpr>(Expr)->getKind() == RISCVMCExpr::VK_RISCV_32_PCREL) + return ELF::R_RISCV_32_PCREL; 
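// The decodeRVCInstr* callbacks above rebuild a compressed-format immediate
// from its split fields: bit 12 carries imm[5] and bits 6:2 carry imm[4:0].
// A standalone sketch, including the 6-bit sign extension that
// decodeSImmOperand<6> applies to the reassembled value:

#include <cassert>
#include <cstdint>

uint64_t fieldFromInstruction(uint32_t Insn, unsigned StartBit,
                              unsigned NumBits) {
  return (Insn >> StartBit) & ((UINT64_C(1) << NumBits) - 1);
}

int64_t decodeRVCSImm6(uint32_t Insn) {
  uint64_t Imm = fieldFromInstruction(Insn, 12, 1) << 5 |
                 fieldFromInstruction(Insn, 2, 5);
  return static_cast<int64_t>(Imm << 58) >> 58; // sign-extend 6 -> 64 bits
}

int main() {
  // An encoding with imm = 0b111111 (bit 12 set, bits 6:2 all set) is -1.
  uint32_t Insn = (1u << 12) | (0x1Fu << 2);
  assert(decodeRVCSImm6(Insn) == -1);
  assert(decodeRVCSImm6(0x1u << 2) == 1); // imm = 0b000001
}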
return ELF::R_RISCV_32; case FK_Data_8: return ELF::R_RISCV_64; @@ -98,6 +103,8 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_RISCV_ADD32; case FK_Data_Add_8: return ELF::R_RISCV_ADD64; + case FK_Data_Add_6b: + return ELF::R_RISCV_SET6; case FK_Data_Sub_1: return ELF::R_RISCV_SUB8; case FK_Data_Sub_2: @@ -106,6 +113,8 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_RISCV_SUB32; case FK_Data_Sub_8: return ELF::R_RISCV_SUB64; + case FK_Data_Sub_6b: + return ELF::R_RISCV_SUB6; case RISCV::fixup_riscv_hi20: return ELF::R_RISCV_HI20; case RISCV::fixup_riscv_lo12_i: @@ -129,5 +138,5 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, std::unique_ptr<MCObjectTargetWriter> llvm::createRISCVELFObjectWriter(uint8_t OSABI, bool Is64Bit) { - return llvm::make_unique<RISCVELFObjectWriter>(OSABI, Is64Bit); + return std::make_unique<RISCVELFObjectWriter>(OSABI, Is64Bit); } diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index fe37b70811d8..8b5fe6dd8252 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -39,6 +39,30 @@ static cl::opt<bool> cl::desc("Disable the emission of assembler pseudo instructions"), cl::init(false), cl::Hidden); +static cl::opt<bool> + ArchRegNames("riscv-arch-reg-names", + cl::desc("Print architectural register names rather than the " + "ABI names (such as x2 instead of sp)"), + cl::init(false), cl::Hidden); + +// The command-line flags above are used by llvm-mc and llc. They can be used by +// `llvm-objdump`, but we override their values here to handle options passed to +// `llvm-objdump` with `-M` (which matches GNU objdump). There did not seem to +// be an easier way to allow these options in all these tools, without doing it +// this way. +bool RISCVInstPrinter::applyTargetSpecificCLOption(StringRef Opt) { + if (Opt == "no-aliases") { + NoAliases = true; + return true; + } + if (Opt == "numeric") { + ArchRegNames = true; + return true; + } + + return false; +} + void RISCVInstPrinter::printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, const MCSubtargetInfo &STI) { bool Res = false; @@ -112,3 +136,20 @@ void RISCVInstPrinter::printFRMArg(const MCInst *MI, unsigned OpNo, static_cast<RISCVFPRndMode::RoundingMode>(MI->getOperand(OpNo).getImm()); O << RISCVFPRndMode::roundingModeToString(FRMArg); } + +void RISCVInstPrinter::printAtomicMemOp(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNo); + + assert(MO.isReg() && "printAtomicMemOp can only print register operands"); + O << "("; + printRegName(O, MO.getReg()); + O << ")"; + return; +} + +const char *RISCVInstPrinter::getRegisterName(unsigned RegNo) { + return getRegisterName(RegNo, ArchRegNames ? 
RISCV::NoRegAltName + : RISCV::ABIRegAltName); +} diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h index 5ca1d3fa20fe..189d72626f3e 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h +++ b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h @@ -25,6 +25,8 @@ public: const MCRegisterInfo &MRI) : MCInstPrinter(MAI, MII, MRI) {} + bool applyTargetSpecificCLOption(StringRef Opt) override; + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, const MCSubtargetInfo &STI) override; void printRegName(raw_ostream &O, unsigned RegNo) const override; @@ -37,6 +39,8 @@ public: const MCSubtargetInfo &STI, raw_ostream &O); void printFRMArg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printAtomicMemOp(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); // Autogenerated by tblgen. void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI, @@ -46,8 +50,8 @@ public: void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, unsigned PrintMethodIdx, const MCSubtargetInfo &STI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo, - unsigned AltIdx = RISCV::ABIRegAltName); + static const char *getRegisterName(unsigned RegNo); + static const char *getRegisterName(unsigned RegNo, unsigned AltIdx); }; } // namespace llvm diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp index 983629692883..089a2def4c21 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp @@ -11,7 +11,10 @@ //===----------------------------------------------------------------------===// #include "RISCVMCAsmInfo.h" +#include "MCTargetDesc/RISCVMCExpr.h" #include "llvm/ADT/Triple.h" +#include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/MC/MCStreamer.h" using namespace llvm; void RISCVMCAsmInfo::anchor() {} @@ -25,3 +28,20 @@ RISCVMCAsmInfo::RISCVMCAsmInfo(const Triple &TT) { Data16bitsDirective = "\t.half\t"; Data32bitsDirective = "\t.word\t"; } + +const MCExpr *RISCVMCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym, + unsigned Encoding, + MCStreamer &Streamer) const { + if (!(Encoding & dwarf::DW_EH_PE_pcrel)) + return MCAsmInfo::getExprForFDESymbol(Sym, Encoding, Streamer); + + // The default symbol subtraction results in an ADD/SUB relocation pair. + // Processing this relocation pair is problematic when linker relaxation is + // enabled, so we follow binutils in using the R_RISCV_32_PCREL relocation + // for the FDE initial location. 
+ MCContext &Ctx = Streamer.getContext(); + const MCExpr *ME = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx); + assert(Encoding & dwarf::DW_EH_PE_sdata4 && "Unexpected encoding"); + return RISCVMCExpr::create(ME, RISCVMCExpr::VK_RISCV_32_PCREL, Ctx); +} diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h index 043fdb7c08c0..6824baf699aa 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h +++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h @@ -23,6 +23,9 @@ class RISCVMCAsmInfo : public MCAsmInfoELF { public: explicit RISCVMCAsmInfo(const Triple &TargetTriple); + + const MCExpr *getExprForFDESymbol(const MCSymbol *Sym, unsigned Encoding, + MCStreamer &Streamer) const override; }; } // namespace llvm diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 0fc775f63ed4..de99960848a5 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -15,6 +15,7 @@ #include "MCTargetDesc/RISCVMCTargetDesc.h" #include "Utils/RISCVBaseInfo.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/Register.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" @@ -100,7 +101,7 @@ void RISCVMCCodeEmitter::expandFunctionCall(const MCInst &MI, raw_ostream &OS, const MCSubtargetInfo &STI) const { MCInst TmpInst; MCOperand Func; - unsigned Ra; + Register Ra; if (MI.getOpcode() == RISCV::PseudoTAIL) { Func = MI.getOperand(0); Ra = RISCV::X6; @@ -266,6 +267,7 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, switch (RVExpr->getKind()) { case RISCVMCExpr::VK_RISCV_None: case RISCVMCExpr::VK_RISCV_Invalid: + case RISCVMCExpr::VK_RISCV_32_PCREL: llvm_unreachable("Unhandled fixup kind!"); case RISCVMCExpr::VK_RISCV_TPREL_ADD: // tprel_add is only used to indicate that a relocation should be emitted diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h b/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h index b5a292dc1b1a..921df376f3df 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h +++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h @@ -36,6 +36,7 @@ public: VK_RISCV_TLS_GD_HI, VK_RISCV_CALL, VK_RISCV_CALL_PLT, + VK_RISCV_32_PCREL, VK_RISCV_Invalid }; diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp index bc45262ab2de..5a4c86e48f1e 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp @@ -16,7 +16,9 @@ #include "RISCVMCAsmInfo.h" #include "RISCVTargetStreamer.h" #include "TargetInfo/RISCVTargetInfo.h" +#include "Utils/RISCVBaseInfo.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/Register.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" @@ -52,7 +54,7 @@ static MCAsmInfo *createRISCVMCAsmInfo(const MCRegisterInfo &MRI, const Triple &TT) { MCAsmInfo *MAI = new RISCVMCAsmInfo(TT); - unsigned SP = MRI.getDwarfRegNum(RISCV::X2, true); + Register SP = MRI.getDwarfRegNum(RISCV::X2, true); MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, SP, 0); MAI->addInitialFrameState(Inst); diff --git a/lib/Target/RISCV/RISCV.h b/lib/Target/RISCV/RISCV.h index 834a1d171143..f23f742a4782 100644 --- a/lib/Target/RISCV/RISCV.h +++ b/lib/Target/RISCV/RISCV.h @@ -18,9 +18,12 @@ #include "llvm/Target/TargetMachine.h" namespace llvm 
{ +class RISCVRegisterBankInfo; +class RISCVSubtarget; class RISCVTargetMachine; class AsmPrinter; class FunctionPass; +class InstructionSelector; class MCInst; class MCOperand; class MachineInstr; @@ -39,6 +42,10 @@ void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &); FunctionPass *createRISCVExpandPseudoPass(); void initializeRISCVExpandPseudoPass(PassRegistry &); + +InstructionSelector *createRISCVInstructionSelector(const RISCVTargetMachine &, + RISCVSubtarget &, + RISCVRegisterBankInfo &); } #endif diff --git a/lib/Target/RISCV/RISCV.td b/lib/Target/RISCV/RISCV.td index e19b70b8e709..46530a8f74a8 100644 --- a/lib/Target/RISCV/RISCV.td +++ b/lib/Target/RISCV/RISCV.td @@ -43,6 +43,11 @@ def FeatureStdExtC def HasStdExtC : Predicate<"Subtarget->hasStdExtC()">, AssemblerPredicate<"FeatureStdExtC">; +def FeatureRVCHints + : SubtargetFeature<"rvc-hints", "EnableRVCHintInstrs", "true", + "Enable RVC Hint Instructions.">; +def HasRVCHints : Predicate<"Subtarget->enableRVCHintInstrs()">, + AssemblerPredicate<"FeatureRVCHints">; def Feature64Bit : SubtargetFeature<"64bit", "HasRV64", "true", "Implements RV64">; @@ -77,14 +82,16 @@ include "RISCVSystemOperands.td" include "RISCVRegisterInfo.td" include "RISCVCallingConv.td" include "RISCVInstrInfo.td" +include "RISCVRegisterBanks.td" //===----------------------------------------------------------------------===// // RISC-V processors supported. //===----------------------------------------------------------------------===// -def : ProcessorModel<"generic-rv32", NoSchedModel, []>; +def : ProcessorModel<"generic-rv32", NoSchedModel, [FeatureRVCHints]>; -def : ProcessorModel<"generic-rv64", NoSchedModel, [Feature64Bit]>; +def : ProcessorModel<"generic-rv64", NoSchedModel, [Feature64Bit, + FeatureRVCHints]>; //===----------------------------------------------------------------------===// // Define the RISC-V target. diff --git a/lib/Target/RISCV/RISCVCallLowering.cpp b/lib/Target/RISCV/RISCVCallLowering.cpp new file mode 100644 index 000000000000..c63a84739c4a --- /dev/null +++ b/lib/Target/RISCV/RISCVCallLowering.cpp @@ -0,0 +1,50 @@ +//===-- RISCVCallLowering.cpp - Call lowering -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file implements the lowering of LLVM calls to machine code calls for +/// GlobalISel. 
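// The new RISCVCallLowering below is a skeleton: only a bare `ret void` and
// an empty formal-argument list succeed; everything else returns false. In
// GlobalISel, a false return from these hooks makes IRTranslator report a
// failure for the function and, unless aborting is forced, codegen falls
// back to the SelectionDAG path. A minimal mock of that contract (mock
// types, not LLVM's real API):

#include <cassert>

struct MockCallLowering {
  // Mirrors lowerReturn below: succeed only when there is no return value.
  bool lowerReturn(const void *RetVal) const { return RetVal == nullptr; }
};

// Returns true if GlobalISel handled the return, false for SDAG fallback.
bool translateReturn(const MockCallLowering &CL, const void *RetVal) {
  return CL.lowerReturn(RetVal);
}

int main() {
  MockCallLowering CL;
  int RetVal = 42;
  assert(translateReturn(CL, nullptr));  // ret void: GlobalISel proceeds
  assert(!translateReturn(CL, &RetVal)); // ret i32 42: falls back
}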
+// +//===----------------------------------------------------------------------===// + +#include "RISCVCallLowering.h" +#include "RISCVISelLowering.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" + +using namespace llvm; + +RISCVCallLowering::RISCVCallLowering(const RISCVTargetLowering &TLI) + : CallLowering(&TLI) {} + +bool RISCVCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, + const Value *Val, + ArrayRef<Register> VRegs) const { + + MachineInstrBuilder Ret = MIRBuilder.buildInstrNoInsert(RISCV::PseudoRET); + + if (Val != nullptr) { + return false; + } + MIRBuilder.insertInstr(Ret); + return true; +} + +bool RISCVCallLowering::lowerFormalArguments( + MachineIRBuilder &MIRBuilder, const Function &F, + ArrayRef<ArrayRef<Register>> VRegs) const { + + if (F.arg_empty()) + return true; + + return false; +} + +bool RISCVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const { + return false; +} diff --git a/lib/Target/RISCV/RISCVCallLowering.h b/lib/Target/RISCV/RISCVCallLowering.h new file mode 100644 index 000000000000..7ce074a61f0a --- /dev/null +++ b/lib/Target/RISCV/RISCVCallLowering.h @@ -0,0 +1,42 @@ +//===-- RISCVCallLowering.h - Call lowering ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file describes how to lower LLVM calls to machine code calls. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_RISCV_RISCVCALLLOWERING_H +#define LLVM_LIB_TARGET_RISCV_RISCVCALLLOWERING_H + +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/CodeGen/ValueTypes.h" + +namespace llvm { + +class RISCVTargetLowering; + +class RISCVCallLowering : public CallLowering { + +public: + RISCVCallLowering(const RISCVTargetLowering &TLI); + + bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val, + ArrayRef<Register> VRegs) const override; + + bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, + ArrayRef<ArrayRef<Register>> VRegs) const override; + + bool lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const override; +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_RISCV_RISCVCALLLOWERING_H diff --git a/lib/Target/RISCV/RISCVCallingConv.td b/lib/Target/RISCV/RISCVCallingConv.td index db13e6e8beca..025454f8fcca 100644 --- a/lib/Target/RISCV/RISCVCallingConv.td +++ b/lib/Target/RISCV/RISCVCallingConv.td @@ -18,11 +18,11 @@ def CSR_ILP32_LP64 def CSR_ILP32F_LP64F : CalleeSavedRegs<(add CSR_ILP32_LP64, - F8_32, F9_32, (sequence "F%u_32", 18, 27))>; + F8_F, F9_F, (sequence "F%u_F", 18, 27))>; def CSR_ILP32D_LP64D : CalleeSavedRegs<(add CSR_ILP32_LP64, - F8_64, F9_64, (sequence "F%u_64", 18, 27))>; + F8_D, F9_D, (sequence "F%u_D", 18, 27))>; // Needed for implementation of RISCVRegisterInfo::getNoPreservedMask() def CSR_NoRegs : CalleeSavedRegs<(add)>; @@ -43,12 +43,12 @@ def CSR_XLEN_F32_Interrupt: CalleeSavedRegs<(add X1, (sequence "X%u", 12, 17), (sequence "X%u", 18, 27), (sequence "X%u", 28, 31), - (sequence "F%u_32", 0, 7), - (sequence "F%u_32", 10, 11), - (sequence "F%u_32", 12, 17), - (sequence "F%u_32", 28, 31), - (sequence "F%u_32", 8, 9), - (sequence "F%u_32", 18, 27))>; + 
(sequence "F%u_F", 0, 7), + (sequence "F%u_F", 10, 11), + (sequence "F%u_F", 12, 17), + (sequence "F%u_F", 28, 31), + (sequence "F%u_F", 8, 9), + (sequence "F%u_F", 18, 27))>; // Same as CSR_Interrupt, but including all 64-bit FP registers. def CSR_XLEN_F64_Interrupt: CalleeSavedRegs<(add X1, @@ -57,9 +57,9 @@ def CSR_XLEN_F64_Interrupt: CalleeSavedRegs<(add X1, (sequence "X%u", 12, 17), (sequence "X%u", 18, 27), (sequence "X%u", 28, 31), - (sequence "F%u_64", 0, 7), - (sequence "F%u_64", 10, 11), - (sequence "F%u_64", 12, 17), - (sequence "F%u_64", 28, 31), - (sequence "F%u_64", 8, 9), - (sequence "F%u_64", 18, 27))>; + (sequence "F%u_D", 0, 7), + (sequence "F%u_D", 10, 11), + (sequence "F%u_D", 12, 17), + (sequence "F%u_D", 28, 31), + (sequence "F%u_D", 8, 9), + (sequence "F%u_D", 18, 27))>; diff --git a/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 1c5171a7b7a4..da5cd16e750c 100644 --- a/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -235,10 +235,10 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI, MachineBasicBlock *LoopMBB, MachineBasicBlock *DoneMBB, AtomicRMWInst::BinOp BinOp, int Width) { - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned ScratchReg = MI.getOperand(1).getReg(); - unsigned AddrReg = MI.getOperand(2).getReg(); - unsigned IncrReg = MI.getOperand(3).getReg(); + Register DestReg = MI.getOperand(0).getReg(); + Register ScratchReg = MI.getOperand(1).getReg(); + Register AddrReg = MI.getOperand(2).getReg(); + Register IncrReg = MI.getOperand(3).getReg(); AtomicOrdering Ordering = static_cast<AtomicOrdering>(MI.getOperand(4).getImm()); @@ -271,9 +271,9 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI, } static void insertMaskedMerge(const RISCVInstrInfo *TII, DebugLoc DL, - MachineBasicBlock *MBB, unsigned DestReg, - unsigned OldValReg, unsigned NewValReg, - unsigned MaskReg, unsigned ScratchReg) { + MachineBasicBlock *MBB, Register DestReg, + Register OldValReg, Register NewValReg, + Register MaskReg, Register ScratchReg) { assert(OldValReg != ScratchReg && "OldValReg and ScratchReg must be unique"); assert(OldValReg != MaskReg && "OldValReg and MaskReg must be unique"); assert(ScratchReg != MaskReg && "ScratchReg and MaskReg must be unique"); @@ -297,11 +297,11 @@ static void doMaskedAtomicBinOpExpansion( MachineBasicBlock *ThisMBB, MachineBasicBlock *LoopMBB, MachineBasicBlock *DoneMBB, AtomicRMWInst::BinOp BinOp, int Width) { assert(Width == 32 && "Should never need to expand masked 64-bit operations"); - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned ScratchReg = MI.getOperand(1).getReg(); - unsigned AddrReg = MI.getOperand(2).getReg(); - unsigned IncrReg = MI.getOperand(3).getReg(); - unsigned MaskReg = MI.getOperand(4).getReg(); + Register DestReg = MI.getOperand(0).getReg(); + Register ScratchReg = MI.getOperand(1).getReg(); + Register AddrReg = MI.getOperand(2).getReg(); + Register IncrReg = MI.getOperand(3).getReg(); + Register MaskReg = MI.getOperand(4).getReg(); AtomicOrdering Ordering = static_cast<AtomicOrdering>(MI.getOperand(5).getImm()); @@ -394,8 +394,8 @@ bool RISCVExpandPseudo::expandAtomicBinOp( } static void insertSext(const RISCVInstrInfo *TII, DebugLoc DL, - MachineBasicBlock *MBB, unsigned ValReg, - unsigned ShamtReg) { + MachineBasicBlock *MBB, Register ValReg, + Register ShamtReg) { BuildMI(MBB, DL, TII->get(RISCV::SLL), ValReg) .addReg(ValReg) .addReg(ShamtReg); @@ 
-436,12 +436,12 @@ bool RISCVExpandPseudo::expandAtomicMinMaxOp( DoneMBB->transferSuccessors(&MBB); MBB.addSuccessor(LoopHeadMBB); - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned Scratch1Reg = MI.getOperand(1).getReg(); - unsigned Scratch2Reg = MI.getOperand(2).getReg(); - unsigned AddrReg = MI.getOperand(3).getReg(); - unsigned IncrReg = MI.getOperand(4).getReg(); - unsigned MaskReg = MI.getOperand(5).getReg(); + Register DestReg = MI.getOperand(0).getReg(); + Register Scratch1Reg = MI.getOperand(1).getReg(); + Register Scratch2Reg = MI.getOperand(2).getReg(); + Register AddrReg = MI.getOperand(3).getReg(); + Register IncrReg = MI.getOperand(4).getReg(); + Register MaskReg = MI.getOperand(5).getReg(); bool IsSigned = BinOp == AtomicRMWInst::Min || BinOp == AtomicRMWInst::Max; AtomicOrdering Ordering = static_cast<AtomicOrdering>(MI.getOperand(IsSigned ? 7 : 6).getImm()); @@ -549,11 +549,11 @@ bool RISCVExpandPseudo::expandAtomicCmpXchg( DoneMBB->transferSuccessors(&MBB); MBB.addSuccessor(LoopHeadMBB); - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned ScratchReg = MI.getOperand(1).getReg(); - unsigned AddrReg = MI.getOperand(2).getReg(); - unsigned CmpValReg = MI.getOperand(3).getReg(); - unsigned NewValReg = MI.getOperand(4).getReg(); + Register DestReg = MI.getOperand(0).getReg(); + Register ScratchReg = MI.getOperand(1).getReg(); + Register AddrReg = MI.getOperand(2).getReg(); + Register CmpValReg = MI.getOperand(3).getReg(); + Register NewValReg = MI.getOperand(4).getReg(); AtomicOrdering Ordering = static_cast<AtomicOrdering>(MI.getOperand(IsMasked ? 6 : 5).getImm()); @@ -582,7 +582,7 @@ bool RISCVExpandPseudo::expandAtomicCmpXchg( // lr.w dest, (addr) // and scratch, dest, mask // bne scratch, cmpval, done - unsigned MaskReg = MI.getOperand(5).getReg(); + Register MaskReg = MI.getOperand(5).getReg(); BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW(Ordering, Width)), DestReg) .addReg(AddrReg); BuildMI(LoopHeadMBB, DL, TII->get(RISCV::AND), ScratchReg) @@ -629,7 +629,7 @@ bool RISCVExpandPseudo::expandAuipcInstPair( MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); const MachineOperand &Symbol = MI.getOperand(1); MachineBasicBlock *NewMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); diff --git a/lib/Target/RISCV/RISCVFrameLowering.cpp b/lib/Target/RISCV/RISCVFrameLowering.cpp index 32c3b9684d2c..6b6f62e18ce9 100644 --- a/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -40,8 +40,16 @@ void RISCVFrameLowering::determineFrameLayout(MachineFunction &MF) const { uint64_t FrameSize = MFI.getStackSize(); // Get the alignment. - uint64_t StackAlign = RI->needsStackRealignment(MF) ? MFI.getMaxAlignment() - : getStackAlignment(); + unsigned StackAlign = getStackAlignment(); + if (RI->needsStackRealignment(MF)) { + unsigned MaxStackAlign = std::max(StackAlign, MFI.getMaxAlignment()); + FrameSize += (MaxStackAlign - StackAlign); + StackAlign = MaxStackAlign; + } + + // Set Max Call Frame Size + uint64_t MaxCallSize = alignTo(MFI.getMaxCallFrameSize(), StackAlign); + MFI.setMaxCallFrameSize(MaxCallSize); // Make sure the frame is aligned. 
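// determineFrameLayout above now pre-adds (MaxStackAlign - StackAlign) to
// the frame size whenever the stack must be realigned: the prologue's
// round-down of SP (the ANDI / SRLI+SLLI sequence later in the patch) can
// slide SP down by at most that many bytes, so padding by exactly that
// amount keeps the slide from eating into the allocated objects. A
// standalone check of the bound, with alignTo/alignDown modeled on
// llvm/Support/MathExtras.h:

#include <cassert>
#include <cstdint>

uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align; // round up to a multiple
}
uint64_t alignDown(uint64_t Value, uint64_t Align) {
  return Value / Align * Align; // what `andi sp, sp, -Align` computes
}

int main() {
  const uint64_t StackAlign = 16, MaxAlign = 64;
  for (uint64_t SP = 0; SP < 4096; SP += StackAlign) { // SP StackAlign-aligned
    uint64_t Loss = SP - alignDown(SP, MaxAlign);
    assert(Loss <= MaxAlign - StackAlign && "pad covers the worst-case slide");
  }
  // So a frame holding `Size` bytes of objects is laid out as:
  uint64_t Size = 200;
  uint64_t Padded = alignTo(Size + (MaxAlign - StackAlign), StackAlign);
  assert(Padded == 256);
}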
FrameSize = alignTo(FrameSize, StackAlign); @@ -52,8 +60,8 @@ void RISCVFrameLowering::determineFrameLayout(MachineFunction &MF) const { void RISCVFrameLowering::adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, unsigned DestReg, - unsigned SrcReg, int64_t Val, + const DebugLoc &DL, Register DestReg, + Register SrcReg, int64_t Val, MachineInstr::MIFlag Flag) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const RISCVInstrInfo *TII = STI.getInstrInfo(); @@ -66,7 +74,7 @@ void RISCVFrameLowering::adjustReg(MachineBasicBlock &MBB, .addReg(SrcReg) .addImm(Val) .setMIFlag(Flag); - } else if (isInt<32>(Val)) { + } else { unsigned Opc = RISCV::ADD; bool isSub = Val < 0; if (isSub) { @@ -74,22 +82,20 @@ void RISCVFrameLowering::adjustReg(MachineBasicBlock &MBB, Opc = RISCV::SUB; } - unsigned ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass); - TII->movImm32(MBB, MBBI, DL, ScratchReg, Val, Flag); + Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass); + TII->movImm(MBB, MBBI, DL, ScratchReg, Val, Flag); BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) .addReg(SrcReg) .addReg(ScratchReg, RegState::Kill) .setMIFlag(Flag); - } else { - report_fatal_error("adjustReg cannot yet handle adjustments >32 bits"); } } // Returns the register used to hold the frame pointer. -static unsigned getFPReg(const RISCVSubtarget &STI) { return RISCV::X8; } +static Register getFPReg(const RISCVSubtarget &STI) { return RISCV::X8; } // Returns the register used to hold the stack pointer. -static unsigned getSPReg(const RISCVSubtarget &STI) { return RISCV::X2; } +static Register getSPReg(const RISCVSubtarget &STI) { return RISCV::X2; } void RISCVFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { @@ -101,8 +107,14 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, const RISCVInstrInfo *TII = STI.getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.begin(); - unsigned FPReg = getFPReg(STI); - unsigned SPReg = getSPReg(STI); + if (RI->needsStackRealignment(MF) && MFI.hasVarSizedObjects()) { + report_fatal_error( + "RISC-V backend can't currently handle functions that need stack " + "realignment and have variable sized objects"); + } + + Register FPReg = getFPReg(STI); + Register SPReg = getSPReg(STI); // Debug location must be unknown since the first debug location is used // to determine the end of the prologue. @@ -119,6 +131,11 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, if (StackSize == 0 && !MFI.adjustsStack()) return; + uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF); + // Split the SP adjustment to reduce the offsets of callee-saved spills. + if (FirstSPAdjustAmount) + StackSize = FirstSPAdjustAmount; + // Allocate space on the stack if necessary. adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup); @@ -141,7 +158,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // directives.
for (const auto &Entry : CSI) { int64_t Offset = MFI.getObjectOffset(Entry.getFrameIdx()); - unsigned Reg = Entry.getReg(); + Register Reg = Entry.getReg(); unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( nullptr, RI->getDwarfRegNum(Reg, true), Offset)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) @@ -159,6 +176,45 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } + + // Emit the second SP adjustment after saving callee-saved registers. + if (FirstSPAdjustAmount) { + uint64_t SecondSPAdjustAmount = MFI.getStackSize() - FirstSPAdjustAmount; + assert(SecondSPAdjustAmount > 0 && + "SecondSPAdjustAmount should be greater than zero"); + adjustReg(MBB, MBBI, DL, SPReg, SPReg, -SecondSPAdjustAmount, + MachineInstr::FrameSetup); + // Emit ".cfi_def_cfa_offset StackSize" + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, -MFI.getStackSize())); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + + if (hasFP(MF)) { + // Realign Stack + const RISCVRegisterInfo *RI = STI.getRegisterInfo(); + if (RI->needsStackRealignment(MF)) { + unsigned MaxAlignment = MFI.getMaxAlignment(); + + const RISCVInstrInfo *TII = STI.getInstrInfo(); + if (isInt<12>(-(int)MaxAlignment)) { + BuildMI(MBB, MBBI, DL, TII->get(RISCV::ANDI), SPReg) + .addReg(SPReg) + .addImm(-(int)MaxAlignment); + } else { + unsigned ShiftAmount = countTrailingZeros(MaxAlignment); + Register VR = + MF.getRegInfo().createVirtualRegister(&RISCV::GPRRegClass); + BuildMI(MBB, MBBI, DL, TII->get(RISCV::SRLI), VR) + .addReg(SPReg) + .addImm(ShiftAmount); + BuildMI(MBB, MBBI, DL, TII->get(RISCV::SLLI), SPReg) + .addReg(VR) + .addImm(ShiftAmount); + } + } + } } void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, @@ -169,8 +225,8 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>(); DebugLoc DL = MBBI->getDebugLoc(); const RISCVInstrInfo *TII = STI.getInstrInfo(); - unsigned FPReg = getFPReg(STI); - unsigned SPReg = getSPReg(STI); + Register FPReg = getFPReg(STI); + Register SPReg = getSPReg(STI); // Skip to before the restores of callee-saved registers // FIXME: assumes exactly one instruction is used to restore each @@ -189,11 +245,29 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, MachineInstr::FrameDestroy); } + uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF); + if (FirstSPAdjustAmount) { + uint64_t SecondSPAdjustAmount = MFI.getStackSize() - FirstSPAdjustAmount; + assert(SecondSPAdjustAmount > 0 && + "SecondSPAdjustAmount should be greater than zero"); + + adjustReg(MBB, LastFrameDestroy, DL, SPReg, SPReg, SecondSPAdjustAmount, + MachineInstr::FrameDestroy); + + // Emit ".cfi_def_cfa_offset FirstSPAdjustAmount" + unsigned CFIIndex = + MF.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, + -FirstSPAdjustAmount)); + BuildMI(MBB, LastFrameDestroy, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + if (hasFP(MF)) { // Find the instruction that restores FP from the stack. for (auto &I = LastFrameDestroy; I != MBBI; ++I) { if (I->mayLoad() && I->getOperand(0).isReg()) { - unsigned DestReg = I->getOperand(0).getReg(); + Register DestReg = I->getOperand(0).getReg(); if (DestReg == FPReg) { // If there is a frame pointer, after restoring the $fp register, we // need to adjust the CFA to ($sp - FPOffset).
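The two hunks above implement the split SP adjustment for the prologue and epilogue. As a rough, standalone sketch of the arithmetic only (assuming the (2048 - StackAlign) policy documented in getFirstSPAdjustAmount further down; the `firstSPAdjust` helper and the driver below are illustrative, not the backend code):

```c++
// Standalone sketch of the SplitSPAdjust arithmetic; firstSPAdjust is a
// hypothetical stand-in for RISCVFrameLowering::getFirstSPAdjustAmount.
#include <cassert>
#include <cstdint>
#include <cstdio>

static bool fitsInSImm12(int64_t V) { return V >= -2048 && V <= 2047; }

static uint64_t firstSPAdjust(uint64_t StackSize, uint64_t StackAlign,
                              bool HasCSRs) {
  // Split only when the frame is too big for one 12-bit immediate and
  // there are callee-saved registers whose spill offsets should stay small.
  if (!fitsInSImm12(StackSize) && HasCSRs)
    return 2048 - StackAlign;
  return 0;
}

int main() {
  const uint64_t StackSize = 2096, StackAlign = 16;
  uint64_t First = firstSPAdjust(StackSize, StackAlign, /*HasCSRs=*/true);
  uint64_t Second = StackSize - First;
  // The first drop keeps every CSR spill offset (2028, 2024, ...) in range;
  // the remainder is applied after the spills, as in the example comment.
  assert(First == 2032 && fitsInSImm12(First) && Second == 64);
  printf("addi sp,sp,-%lu ... addi sp,sp,-%lu\n",
         (unsigned long)First, (unsigned long)Second);
  return 0;
}
```

The epilogue undoes the same two amounts in reverse order, updating .cfi_def_cfa_offset at each step, as the hunks above show.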
@@ -214,13 +288,16 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, // Iterate over list of callee-saved registers and emit .cfi_restore // directives. for (const auto &Entry : CSI) { - unsigned Reg = Entry.getReg(); + Register Reg = Entry.getReg(); unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore( nullptr, RI->getDwarfRegNum(Reg, true))); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } + if (FirstSPAdjustAmount) + StackSize = FirstSPAdjustAmount; + // Deallocate stack adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy); @@ -249,6 +326,8 @@ int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea() + MFI.getOffsetAdjustment(); + uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF); + if (CSI.size()) { MinCSFI = CSI[0].getFrameIdx(); MaxCSFI = CSI[CSI.size() - 1].getFrameIdx(); @@ -256,6 +335,17 @@ int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, if (FI >= MinCSFI && FI <= MaxCSFI) { FrameReg = RISCV::X2; + + if (FirstSPAdjustAmount) + Offset += FirstSPAdjustAmount; + else + Offset += MF.getFrameInfo().getStackSize(); + } else if (RI->needsStackRealignment(MF)) { + assert(!MFI.hasVarSizedObjects() && + "Unexpected combination of stack realignment and varsized objects"); + // If the stack was realigned, the frame pointer is set in order to allow + // SP to be restored, but we still access stack objects using SP. + FrameReg = RISCV::X2; Offset += MF.getFrameInfo().getStackSize(); } else { FrameReg = RI->getFrameRegister(MF); @@ -338,7 +428,7 @@ bool RISCVFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const { - unsigned SPReg = RISCV::X2; + Register SPReg = RISCV::X2; DebugLoc DL = MI->getDebugLoc(); if (!hasReservedCallFrame(MF)) { @@ -362,3 +452,39 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr( return MBB.erase(MI); } + +// We would like to split the SP adjustment so that the prologue/epilogue +// looks like the following instructions. In this way, the offset of each +// callee-saved register can fit in a single store. +// add sp,sp,-2032 +// sw ra,2028(sp) +// sw s0,2024(sp) +// sw s1,2020(sp) +// sw s3,2012(sp) +// sw s4,2008(sp) +// add sp,sp,-64 uint64_t RISCVFrameLowering::getFirstSPAdjustAmount(const MachineFunction &MF) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); + uint64_t StackSize = MFI.getStackSize(); + uint64_t StackAlign = getStackAlignment(); + + // FIXME: Disable SplitSPAdjust once the save-restore libcall patch lands. + // The callee-saved registers will be pushed by the + // save-restore libcalls, so we won't have to split the SP adjustment + // in this case. + // + // Return the FirstSPAdjustAmount if the StackSize cannot fit in a signed + // 12-bit immediate and there exists a callee-saved register that needs to be + // pushed. + if (!isInt<12>(StackSize) && (CSI.size() > 0)) { + // FirstSPAdjustAmount is chosen as (2048 - StackAlign) + // because 2048 would cause sp = sp + 2048 in the epilogue to be split into + // multiple instructions. Offsets smaller than 2048 can fit in a single + // load/store instruction, and we have to stick to the stack alignment.
+ // 2048 is 16-byte aligned; the stack alignment for RV32 and RV64 is 16 and + // for RV32E it is 4, so (2048 - StackAlign) will satisfy the stack alignment. + return 2048 - StackAlign; + } + return 0; +} diff --git a/lib/Target/RISCV/RISCVFrameLowering.h b/lib/Target/RISCV/RISCVFrameLowering.h index 0e045c3ff853..f4a5949773d9 100644 --- a/lib/Target/RISCV/RISCVFrameLowering.h +++ b/lib/Target/RISCV/RISCVFrameLowering.h @@ -22,7 +22,7 @@ class RISCVFrameLowering : public TargetFrameLowering { public: explicit RISCVFrameLowering(const RISCVSubtarget &STI) : TargetFrameLowering(StackGrowsDown, - /*StackAlignment=*/16, + /*StackAlignment=*/Align(16), /*LocalAreaOffset=*/0), STI(STI) {} @@ -45,13 +45,18 @@ public: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + // Get the first stack adjustment amount for SplitSPAdjust. + // Return 0 if we don't want to split the SP adjustment in the prologue and + // epilogue. + uint64_t getFirstSPAdjustAmount(const MachineFunction &MF) const; + protected: const RISCVSubtarget &STI; private: void determineFrameLayout(MachineFunction &MF) const; void adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, + const DebugLoc &DL, Register DestReg, Register SrcReg, int64_t Val, MachineInstr::MIFlag Flag) const; }; } diff --git a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index d0a3af375a6d..1a12d9177d2a 100644 --- a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -68,7 +68,7 @@ static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, int64_t Imm, RISCVMatInt::InstSeq Seq; RISCVMatInt::generateInstSeq(Imm, XLenVT == MVT::i64, Seq); - SDNode *Result; + SDNode *Result = nullptr; SDValue SrcReg = CurDAG->getRegister(RISCV::X0, XLenVT); for (RISCVMatInt::Inst &Inst : Seq) { SDValue SDImm = CurDAG->getTargetConstant(Inst.Imm, DL, XLenVT); @@ -179,6 +179,9 @@ bool RISCVDAGToDAGISel::SelectInlineAsmMemoryOperand( // operand and need no special handling. OutOps.push_back(Op); return false; + case InlineAsm::Constraint_A: + OutOps.push_back(Op); + return false; default: break; } diff --git a/lib/Target/RISCV/RISCVISelLowering.cpp b/lib/Target/RISCV/RISCVISelLowering.cpp index ce7b85911ab6..dc829fce9013 100644 --- a/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/lib/Target/RISCV/RISCVISelLowering.cpp @@ -100,6 +100,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (Subtarget.is64Bit()) { + setOperationAction(ISD::ADD, MVT::i32, Custom); + setOperationAction(ISD::SUB, MVT::i32, Custom); setOperationAction(ISD::SHL, MVT::i32, Custom); setOperationAction(ISD::SRA, MVT::i32, Custom); setOperationAction(ISD::SRL, MVT::i32, Custom); @@ -116,6 +118,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } if (Subtarget.is64Bit() && Subtarget.hasStdExtM()) { + setOperationAction(ISD::MUL, MVT::i32, Custom); setOperationAction(ISD::SDIV, MVT::i32, Custom); setOperationAction(ISD::UDIV, MVT::i32, Custom); setOperationAction(ISD::UREM, MVT::i32, Custom); @@ -194,8 +197,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setBooleanContents(ZeroOrOneBooleanContent); - // Function alignments (log2). - unsigned FunctionAlignment = Subtarget.hasStdExtC() ? 1 : 2; + // Function alignments. + const Align FunctionAlignment(Subtarget.hasStdExtC() ?
2 : 4); setMinFunctionAlignment(FunctionAlignment); setPrefFunctionAlignment(FunctionAlignment); @@ -231,7 +234,7 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = 4; + Info.align = Align(4); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; @@ -660,7 +663,7 @@ SDValue RISCVTargetLowering::lowerFRAMEADDR(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MFI.setFrameAddressIsTaken(true); - unsigned FrameReg = RI.getFrameRegister(MF); + Register FrameReg = RI.getFrameRegister(MF); int XLenInBytes = Subtarget.getXLen() / 8; EVT VT = Op.getValueType(); @@ -703,7 +706,7 @@ SDValue RISCVTargetLowering::lowerRETURNADDR(SDValue Op, // Return the value of the return address register, marking it an implicit // live-in. - unsigned Reg = MF.addLiveIn(RI.getRARegister(), getRegClassFor(XLenVT)); + Register Reg = MF.addLiveIn(RI.getRARegister(), getRegClassFor(XLenVT)); return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, XLenVT); } @@ -834,6 +837,18 @@ static SDValue customLegalizeToWOp(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes); } +// Converts the given 32-bit operation to an i64 operation with sign-extension +// semantics, reducing the number of sign-extension instructions. +static SDValue customLegalizeToWOpWithSExt(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0)); + SDValue NewOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1)); + SDValue NewWOp = DAG.getNode(N->getOpcode(), DL, MVT::i64, NewOp0, NewOp1); + SDValue NewRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, NewWOp, + DAG.getValueType(MVT::i32)); + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes); +} + void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { @@ -854,6 +869,15 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(RCW.getValue(2)); break; } + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: + assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && + "Unexpected custom legalisation"); + if (N->getOperand(1).getOpcode() == ISD::Constant) + return; + Results.push_back(customLegalizeToWOpWithSExt(N, DAG)); + break; case ISD::SHL: case ISD::SRA: case ISD::SRL: @@ -1007,12 +1031,14 @@ bool RISCVTargetLowering::isDesirableToCommuteWithShift( // We can materialise `c1 << c2` into an add immediate, so it's "free", // and the combine should happen, to potentially allow further combines // later. - if (isLegalAddImmediate(ShiftedC1Int.getSExtValue())) + if (ShiftedC1Int.getMinSignedBits() <= 64 && + isLegalAddImmediate(ShiftedC1Int.getSExtValue())) return true; // We can materialise `c1` in an add immediate, so it's "free", and the // combine should be prevented.
- if (isLegalAddImmediate(C1Int.getSExtValue())) + if (C1Int.getMinSignedBits() <= 64 && + isLegalAddImmediate(C1Int.getSExtValue())) return false; // Neither constant will fit into an immediate, so find materialisation @@ -1052,8 +1078,8 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( return 1; } -MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI, - MachineBasicBlock *BB) { +static MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI, + MachineBasicBlock *BB) { assert(MI.getOpcode() == RISCV::ReadCycleWide && "Unexpected instruction"); // To read the 64-bit cycle CSR on a 32-bit target, we read the two halves. @@ -1085,9 +1111,9 @@ MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI, BB->addSuccessor(LoopMBB); MachineRegisterInfo &RegInfo = MF.getRegInfo(); - unsigned ReadAgainReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); - unsigned LoReg = MI.getOperand(0).getReg(); - unsigned HiReg = MI.getOperand(1).getReg(); + Register ReadAgainReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); + Register LoReg = MI.getOperand(0).getReg(); + Register HiReg = MI.getOperand(1).getReg(); DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); @@ -1122,9 +1148,9 @@ static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI, DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); - unsigned LoReg = MI.getOperand(0).getReg(); - unsigned HiReg = MI.getOperand(1).getReg(); - unsigned SrcReg = MI.getOperand(2).getReg(); + Register LoReg = MI.getOperand(0).getReg(); + Register HiReg = MI.getOperand(1).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); const TargetRegisterClass *SrcRC = &RISCV::FPR64RegClass; int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex(); @@ -1154,9 +1180,9 @@ static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI, DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned LoReg = MI.getOperand(1).getReg(); - unsigned HiReg = MI.getOperand(2).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register LoReg = MI.getOperand(1).getReg(); + Register HiReg = MI.getOperand(2).getReg(); const TargetRegisterClass *DstRC = &RISCV::FPR64RegClass; int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex(); @@ -1215,12 +1241,12 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI, // previous selects in the sequence. // These conditions could be further relaxed. See the X86 target for a // related approach and more information. 
- unsigned LHS = MI.getOperand(1).getReg(); - unsigned RHS = MI.getOperand(2).getReg(); + Register LHS = MI.getOperand(1).getReg(); + Register RHS = MI.getOperand(2).getReg(); auto CC = static_cast<ISD::CondCode>(MI.getOperand(3).getImm()); SmallVector<MachineInstr *, 4> SelectDebugValues; - SmallSet<unsigned, 4> SelectDests; + SmallSet<Register, 4> SelectDests; SelectDests.insert(MI.getOperand(0).getReg()); MachineInstr *LastSelectPseudo = &MI; @@ -1363,12 +1389,12 @@ static const MCPhysReg ArgGPRs[] = { RISCV::X14, RISCV::X15, RISCV::X16, RISCV::X17 }; static const MCPhysReg ArgFPR32s[] = { - RISCV::F10_32, RISCV::F11_32, RISCV::F12_32, RISCV::F13_32, - RISCV::F14_32, RISCV::F15_32, RISCV::F16_32, RISCV::F17_32 + RISCV::F10_F, RISCV::F11_F, RISCV::F12_F, RISCV::F13_F, + RISCV::F14_F, RISCV::F15_F, RISCV::F16_F, RISCV::F17_F }; static const MCPhysReg ArgFPR64s[] = { - RISCV::F10_64, RISCV::F11_64, RISCV::F12_64, RISCV::F13_64, - RISCV::F14_64, RISCV::F15_64, RISCV::F16_64, RISCV::F17_64 + RISCV::F10_D, RISCV::F11_D, RISCV::F12_D, RISCV::F13_D, + RISCV::F14_D, RISCV::F15_D, RISCV::F16_D, RISCV::F17_D }; // Pass a 2*XLEN argument that has been split into two XLEN values through @@ -1378,7 +1404,7 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1, MVT ValVT2, MVT LocVT2, ISD::ArgFlagsTy ArgFlags2) { unsigned XLenInBytes = XLen / 8; - if (unsigned Reg = State.AllocateReg(ArgGPRs)) { + if (Register Reg = State.AllocateReg(ArgGPRs)) { // At least one half can be passed via register. State.addLoc(CCValAssign::getReg(VA1.getValNo(), VA1.getValVT(), Reg, VA1.getLocVT(), CCValAssign::Full)); @@ -1395,7 +1421,7 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1, return false; } - if (unsigned Reg = State.AllocateReg(ArgGPRs)) { + if (Register Reg = State.AllocateReg(ArgGPRs)) { // The second half can also be passed via register. State.addLoc( CCValAssign::getReg(ValNo2, ValVT2, Reg, LocVT2, CCValAssign::Full)); @@ -1495,7 +1521,7 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, // GPRs, split between a GPR and the stack, or passed completely on the // stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these // cases. - unsigned Reg = State.AllocateReg(ArgGPRs); + Register Reg = State.AllocateReg(ArgGPRs); LocVT = MVT::i32; if (!Reg) { unsigned StackOffset = State.AllocateStack(8, 8); @@ -1537,7 +1563,7 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, } // Allocate to a register if possible, or else a stack slot. 
- unsigned Reg; + Register Reg; if (ValVT == MVT::f32 && !UseGPRForF32) Reg = State.AllocateReg(ArgFPR32s, ArgFPR64s); else if (ValVT == MVT::f64 && !UseGPRForF64) @@ -1673,7 +1699,7 @@ static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain, break; } - unsigned VReg = RegInfo.createVirtualRegister(RC); + Register VReg = RegInfo.createVirtualRegister(RC); RegInfo.addLiveIn(VA.getLocReg(), VReg); Val = DAG.getCopyFromReg(Chain, DL, VReg, LocVT); @@ -1751,7 +1777,7 @@ static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain, assert(VA.isRegLoc() && "Expected register VA assignment"); - unsigned LoVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); + Register LoVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); RegInfo.addLiveIn(VA.getLocReg(), LoVReg); SDValue Lo = DAG.getCopyFromReg(Chain, DL, LoVReg, MVT::i32); SDValue Hi; @@ -1763,13 +1789,70 @@ static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain, MachinePointerInfo::getFixedStack(MF, FI)); } else { // Second half of f64 is passed in another GPR. - unsigned HiVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); + Register HiVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); RegInfo.addLiveIn(VA.getLocReg() + 1, HiVReg); Hi = DAG.getCopyFromReg(Chain, DL, HiVReg, MVT::i32); } return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi); } +// FastCC shows less than a 1% performance improvement on some particular +// benchmarks, but in theory it may be beneficial in some cases. +static bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + + if (LocVT == MVT::i32 || LocVT == MVT::i64) { + // X5 and X6 might be used for save-restore libcall. + static const MCPhysReg GPRList[] = { + RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13, RISCV::X14, + RISCV::X15, RISCV::X16, RISCV::X17, RISCV::X7, RISCV::X28, + RISCV::X29, RISCV::X30, RISCV::X31}; + if (unsigned Reg = State.AllocateReg(GPRList)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + if (LocVT == MVT::f32) { + static const MCPhysReg FPR32List[] = { + RISCV::F10_F, RISCV::F11_F, RISCV::F12_F, RISCV::F13_F, RISCV::F14_F, + RISCV::F15_F, RISCV::F16_F, RISCV::F17_F, RISCV::F0_F, RISCV::F1_F, + RISCV::F2_F, RISCV::F3_F, RISCV::F4_F, RISCV::F5_F, RISCV::F6_F, + RISCV::F7_F, RISCV::F28_F, RISCV::F29_F, RISCV::F30_F, RISCV::F31_F}; + if (unsigned Reg = State.AllocateReg(FPR32List)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + if (LocVT == MVT::f64) { + static const MCPhysReg FPR64List[] = { + RISCV::F10_D, RISCV::F11_D, RISCV::F12_D, RISCV::F13_D, RISCV::F14_D, + RISCV::F15_D, RISCV::F16_D, RISCV::F17_D, RISCV::F0_D, RISCV::F1_D, + RISCV::F2_D, RISCV::F3_D, RISCV::F4_D, RISCV::F5_D, RISCV::F6_D, + RISCV::F7_D, RISCV::F28_D, RISCV::F29_D, RISCV::F30_D, RISCV::F31_D}; + if (unsigned Reg = State.AllocateReg(FPR64List)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + if (LocVT == MVT::i32 || LocVT == MVT::f32) { + unsigned Offset4 = State.AllocateStack(4, 4); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset4, LocVT, LocInfo)); + return false; + } + + if (LocVT == MVT::i64 || LocVT == MVT::f64) { + unsigned Offset5 = State.AllocateStack(8, 8); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset5, LocVT, LocInfo)); + return false; + } + + return true; // CC didn't match.
+} + // Transform physical registers into virtual registers. SDValue RISCVTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, @@ -1809,7 +1892,11 @@ SDValue RISCVTargetLowering::LowerFormalArguments( // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); - analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false); + + if (CallConv == CallingConv::Fast) + CCInfo.AnalyzeFormalArguments(Ins, CC_RISCV_FastCC); + else + analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -1877,8 +1964,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments( // ensure that the frame pointer is 2*XLEN-aligned, which in turn ensures // offsets to even-numbered registers remain 2*XLEN-aligned. if (Idx % 2) { - FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset - (int)XLenInBytes, - true); + MFI.CreateFixedObject(XLenInBytes, VaArgOffset - (int)XLenInBytes, true); VarArgsSaveSize += XLenInBytes; } @@ -1886,7 +1972,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments( // to the vararg save area. for (unsigned I = Idx; I < ArgRegs.size(); ++I, VaArgOffset += XLenInBytes) { - const unsigned Reg = RegInfo.createVirtualRegister(RC); + const Register Reg = RegInfo.createVirtualRegister(RC); RegInfo.addLiveIn(ArgRegs[I], Reg); SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, XLenVT); FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true); @@ -1920,7 +2006,6 @@ bool RISCVTargetLowering::isEligibleForTailCallOptimization( auto &Callee = CLI.Callee; auto CalleeCC = CLI.CallConv; - auto IsVarArg = CLI.IsVarArg; auto &Outs = CLI.Outs; auto &Caller = MF.getFunction(); auto CallerCC = Caller.getCallingConv(); @@ -1937,10 +2022,6 @@ bool RISCVTargetLowering::isEligibleForTailCallOptimization( if (Caller.hasFnAttribute("interrupt")) return false; - // Do not tail call opt functions with varargs. - if (IsVarArg) - return false; - // Do not tail call opt if the stack is used to pass parameters. if (CCInfo.getNextStackOffset() != 0) return false; @@ -2015,7 +2096,11 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, // Analyze the operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); - analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI); + + if (CallConv == CallingConv::Fast) + ArgCCInfo.AnalyzeCallOperands(Outs, CC_RISCV_FastCC); + else + analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI); // Check if it's really possible to do a tail call. if (IsTailCall) @@ -2057,7 +2142,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL); // Copy argument values to their designated locations.
- SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + SmallVector<std::pair<Register, SDValue>, 8> RegsToPass; SmallVector<SDValue, 8> MemOpChains; SDValue StackPtr; for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) { @@ -2074,7 +2159,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue Lo = SplitF64.getValue(0); SDValue Hi = SplitF64.getValue(1); - unsigned RegLo = VA.getLocReg(); + Register RegLo = VA.getLocReg(); RegsToPass.push_back(std::make_pair(RegLo, Lo)); if (RegLo == RISCV::X17) { @@ -2087,7 +2172,8 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getStore(Chain, DL, Hi, StackPtr, MachinePointerInfo())); } else { // Second half of f64 is passed in another GPR. - unsigned RegHigh = RegLo + 1; + assert(RegLo < RISCV::X31 && "Invalid register pair"); + Register RegHigh = RegLo + 1; RegsToPass.push_back(std::make_pair(RegHigh, Hi)); } continue; @@ -2302,8 +2388,9 @@ RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, DAG.getVTList(MVT::i32, MVT::i32), Val); SDValue Lo = SplitF64.getValue(0); SDValue Hi = SplitF64.getValue(1); - unsigned RegLo = VA.getLocReg(); - unsigned RegHi = RegLo + 1; + Register RegLo = VA.getLocReg(); + assert(RegLo < RISCV::X31 && "Invalid register pair"); + Register RegHi = RegLo + 1; Chain = DAG.getCopyToReg(Chain, DL, RegLo, Lo, Glue); Glue = Chain.getValue(1); RetOps.push_back(DAG.getRegister(RegLo, MVT::i32)); @@ -2397,6 +2484,27 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { return nullptr; } +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +RISCVTargetLowering::ConstraintType +RISCVTargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: + break; + case 'f': + return C_RegisterClass; + case 'I': + case 'J': + case 'K': + return C_Immediate; + case 'A': + return C_Memory; + } + } + return TargetLowering::getConstraintType(Constraint); +} + std::pair<unsigned, const TargetRegisterClass *> RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, @@ -2407,14 +2515,125 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, switch (Constraint[0]) { case 'r': return std::make_pair(0U, &RISCV::GPRRegClass); + case 'f': + if (Subtarget.hasStdExtF() && VT == MVT::f32) + return std::make_pair(0U, &RISCV::FPR32RegClass); + if (Subtarget.hasStdExtD() && VT == MVT::f64) + return std::make_pair(0U, &RISCV::FPR64RegClass); + break; default: break; } } + // Clang will correctly decode the usage of register name aliases into their + // official names. However, other frontends like `rustc` do not. This allows + // users of these frontends to use the ABI names for registers in LLVM-style + // register constraints. 
+ Register XRegFromAlias = StringSwitch<Register>(Constraint.lower()) + .Case("{zero}", RISCV::X0) + .Case("{ra}", RISCV::X1) + .Case("{sp}", RISCV::X2) + .Case("{gp}", RISCV::X3) + .Case("{tp}", RISCV::X4) + .Case("{t0}", RISCV::X5) + .Case("{t1}", RISCV::X6) + .Case("{t2}", RISCV::X7) + .Cases("{s0}", "{fp}", RISCV::X8) + .Case("{s1}", RISCV::X9) + .Case("{a0}", RISCV::X10) + .Case("{a1}", RISCV::X11) + .Case("{a2}", RISCV::X12) + .Case("{a3}", RISCV::X13) + .Case("{a4}", RISCV::X14) + .Case("{a5}", RISCV::X15) + .Case("{a6}", RISCV::X16) + .Case("{a7}", RISCV::X17) + .Case("{s2}", RISCV::X18) + .Case("{s3}", RISCV::X19) + .Case("{s4}", RISCV::X20) + .Case("{s5}", RISCV::X21) + .Case("{s6}", RISCV::X22) + .Case("{s7}", RISCV::X23) + .Case("{s8}", RISCV::X24) + .Case("{s9}", RISCV::X25) + .Case("{s10}", RISCV::X26) + .Case("{s11}", RISCV::X27) + .Case("{t3}", RISCV::X28) + .Case("{t4}", RISCV::X29) + .Case("{t5}", RISCV::X30) + .Case("{t6}", RISCV::X31) + .Default(RISCV::NoRegister); + if (XRegFromAlias != RISCV::NoRegister) + return std::make_pair(XRegFromAlias, &RISCV::GPRRegClass); + + // Since TargetLowering::getRegForInlineAsmConstraint uses the name of the + // TableGen record rather than the AsmName to choose registers for InlineAsm + // constraints, plus we want to match those names to the widest floating point + // register type available, manually select floating point registers here. + // + // The second case is the ABI name of the register, so that frontends can also + // use the ABI names in register constraint lists. + if (Subtarget.hasStdExtF() || Subtarget.hasStdExtD()) { + std::pair<Register, Register> FReg = + StringSwitch<std::pair<Register, Register>>(Constraint.lower()) + .Cases("{f0}", "{ft0}", {RISCV::F0_F, RISCV::F0_D}) + .Cases("{f1}", "{ft1}", {RISCV::F1_F, RISCV::F1_D}) + .Cases("{f2}", "{ft2}", {RISCV::F2_F, RISCV::F2_D}) + .Cases("{f3}", "{ft3}", {RISCV::F3_F, RISCV::F3_D}) + .Cases("{f4}", "{ft4}", {RISCV::F4_F, RISCV::F4_D}) + .Cases("{f5}", "{ft5}", {RISCV::F5_F, RISCV::F5_D}) + .Cases("{f6}", "{ft6}", {RISCV::F6_F, RISCV::F6_D}) + .Cases("{f7}", "{ft7}", {RISCV::F7_F, RISCV::F7_D}) + .Cases("{f8}", "{fs0}", {RISCV::F8_F, RISCV::F8_D}) + .Cases("{f9}", "{fs1}", {RISCV::F9_F, RISCV::F9_D}) + .Cases("{f10}", "{fa0}", {RISCV::F10_F, RISCV::F10_D}) + .Cases("{f11}", "{fa1}", {RISCV::F11_F, RISCV::F11_D}) + .Cases("{f12}", "{fa2}", {RISCV::F12_F, RISCV::F12_D}) + .Cases("{f13}", "{fa3}", {RISCV::F13_F, RISCV::F13_D}) + .Cases("{f14}", "{fa4}", {RISCV::F14_F, RISCV::F14_D}) + .Cases("{f15}", "{fa5}", {RISCV::F15_F, RISCV::F15_D}) + .Cases("{f16}", "{fa6}", {RISCV::F16_F, RISCV::F16_D}) + .Cases("{f17}", "{fa7}", {RISCV::F17_F, RISCV::F17_D}) + .Cases("{f18}", "{fs2}", {RISCV::F18_F, RISCV::F18_D}) + .Cases("{f19}", "{fs3}", {RISCV::F19_F, RISCV::F19_D}) + .Cases("{f20}", "{fs4}", {RISCV::F20_F, RISCV::F20_D}) + .Cases("{f21}", "{fs5}", {RISCV::F21_F, RISCV::F21_D}) + .Cases("{f22}", "{fs6}", {RISCV::F22_F, RISCV::F22_D}) + .Cases("{f23}", "{fs7}", {RISCV::F23_F, RISCV::F23_D}) + .Cases("{f24}", "{fs8}", {RISCV::F24_F, RISCV::F24_D}) + .Cases("{f25}", "{fs9}", {RISCV::F25_F, RISCV::F25_D}) + .Cases("{f26}", "{fs10}", {RISCV::F26_F, RISCV::F26_D}) + .Cases("{f27}", "{fs11}", {RISCV::F27_F, RISCV::F27_D}) + .Cases("{f28}", "{ft8}", {RISCV::F28_F, RISCV::F28_D}) + .Cases("{f29}", "{ft9}", {RISCV::F29_F, RISCV::F29_D}) + .Cases("{f30}", "{ft10}", {RISCV::F30_F, RISCV::F30_D}) + .Cases("{f31}", "{ft11}", {RISCV::F31_F, RISCV::F31_D}) + .Default({RISCV::NoRegister, 
RISCV::NoRegister}); + if (FReg.first != RISCV::NoRegister) + return Subtarget.hasStdExtD() + ? std::make_pair(FReg.second, &RISCV::FPR64RegClass) + : std::make_pair(FReg.first, &RISCV::FPR32RegClass); + } + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } +unsigned +RISCVTargetLowering::getInlineAsmMemConstraint(StringRef ConstraintCode) const { + // Currently only support length 1 constraints. + if (ConstraintCode.size() == 1) { + switch (ConstraintCode[0]) { + case 'A': + return InlineAsm::Constraint_A; + default: + break; + } + } + + return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); +} + void RISCVTargetLowering::LowerAsmOperandForConstraint( SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const { @@ -2619,3 +2838,13 @@ unsigned RISCVTargetLowering::getExceptionSelectorRegister( const Constant *PersonalityFn) const { return RISCV::X11; } + +bool RISCVTargetLowering::shouldExtendTypeInLibCall(EVT Type) const { + // Return false to suppress the unnecessary extensions if the LibCall + // arguments or return value is f32 type for LP64 ABI. + RISCVABI::ABI ABI = Subtarget.getTargetABI(); + if (ABI == RISCVABI::ABI_LP64 && (Type == MVT::f32)) + return false; + + return true; +} diff --git a/lib/Target/RISCV/RISCVISelLowering.h b/lib/Target/RISCV/RISCVISelLowering.h index 17db03bbb69e..18fc7350bbbf 100644 --- a/lib/Target/RISCV/RISCVISelLowering.h +++ b/lib/Target/RISCV/RISCVISelLowering.h @@ -92,6 +92,10 @@ public: // This method returns the name of a target specific DAG node. const char *getTargetNodeName(unsigned Opcode) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; + + unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override; + std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; @@ -141,6 +145,8 @@ public: unsigned getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + bool shouldExtendTypeInLibCall(EVT Type) const override; + private: void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl<ISD::InputArg> &Ins, diff --git a/lib/Target/RISCV/RISCVInstrInfo.cpp b/lib/Target/RISCV/RISCVInstrInfo.cpp index 99c8d2ef73de..084839299530 100644 --- a/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -14,6 +14,7 @@ #include "RISCV.h" #include "RISCVSubtarget.h" #include "RISCVTargetMachine.h" +#include "Utils/RISCVMatInt.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -28,8 +29,9 @@ using namespace llvm; -RISCVInstrInfo::RISCVInstrInfo() - : RISCVGenInstrInfo(RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP) {} +RISCVInstrInfo::RISCVInstrInfo(RISCVSubtarget &STI) + : RISCVGenInstrInfo(RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP), + STI(STI) {} unsigned RISCVInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const { @@ -156,24 +158,43 @@ void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(Opcode), DstReg).addFrameIndex(FI).addImm(0); } -void RISCVInstrInfo::movImm32(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, unsigned DstReg, uint64_t Val, - MachineInstr::MIFlag Flag) const { - assert(isInt<32>(Val) && "Can only materialize 32-bit constants"); +void RISCVInstrInfo::movImm(MachineBasicBlock &MBB, + 
MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register DstReg, uint64_t Val, + MachineInstr::MIFlag Flag) const { + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + bool IsRV64 = MF->getSubtarget<RISCVSubtarget>().is64Bit(); + Register SrcReg = RISCV::X0; + Register Result = MRI.createVirtualRegister(&RISCV::GPRRegClass); + unsigned Num = 0; + + if (!IsRV64 && !isInt<32>(Val)) + report_fatal_error("Should only materialize 32-bit constants for RV32"); + + RISCVMatInt::InstSeq Seq; + RISCVMatInt::generateInstSeq(Val, IsRV64, Seq); + assert(Seq.size() > 0); - // TODO: If the value can be materialized using only one instruction, only - // insert a single instruction. + for (RISCVMatInt::Inst &Inst : Seq) { + // Write the final result to DstReg if it's the last instruction in the Seq. + // Otherwise, write the result to the temp register. + if (++Num == Seq.size()) + Result = DstReg; - uint64_t Hi20 = ((Val + 0x800) >> 12) & 0xfffff; - uint64_t Lo12 = SignExtend64<12>(Val); - BuildMI(MBB, MBBI, DL, get(RISCV::LUI), DstReg) - .addImm(Hi20) - .setMIFlag(Flag); - BuildMI(MBB, MBBI, DL, get(RISCV::ADDI), DstReg) - .addReg(DstReg, RegState::Kill) - .addImm(Lo12) - .setMIFlag(Flag); + if (Inst.Opc == RISCV::LUI) { + BuildMI(MBB, MBBI, DL, get(RISCV::LUI), Result) + .addImm(Inst.Imm) + .setMIFlag(Flag); + } else { + BuildMI(MBB, MBBI, DL, get(Inst.Opc), Result) + .addReg(SrcReg, RegState::Kill) + .addImm(Inst.Imm) + .setMIFlag(Flag); + } + // Only the first instruction has X0 as its source. + SrcReg = Result; + } } // The contents of values added to Cond are not examined outside of @@ -372,7 +393,7 @@ unsigned RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, // FIXME: A virtual register must be used initially, as the register // scavenger won't work with empty blocks (SIInstrInfo::insertIndirectBranch // uses the same workaround). 
- unsigned ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass); + Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass); auto II = MBB.end(); MachineInstr &LuiMI = *BuildMI(MBB, II, DL, get(RISCV::LUI), ScratchReg) @@ -466,3 +487,58 @@ bool RISCVInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { } return MI.isAsCheapAsAMove(); } + +bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, + StringRef &ErrInfo) const { + const MCInstrInfo *MCII = STI.getInstrInfo(); + MCInstrDesc const &Desc = MCII->get(MI.getOpcode()); + + for (auto &OI : enumerate(Desc.operands())) { + unsigned OpType = OI.value().OperandType; + if (OpType >= RISCVOp::OPERAND_FIRST_RISCV_IMM && + OpType <= RISCVOp::OPERAND_LAST_RISCV_IMM) { + const MachineOperand &MO = MI.getOperand(OI.index()); + if (MO.isImm()) { + int64_t Imm = MO.getImm(); + bool Ok; + switch (OpType) { + default: + llvm_unreachable("Unexpected operand type"); + case RISCVOp::OPERAND_UIMM4: + Ok = isUInt<4>(Imm); + break; + case RISCVOp::OPERAND_UIMM5: + Ok = isUInt<5>(Imm); + break; + case RISCVOp::OPERAND_UIMM12: + Ok = isUInt<12>(Imm); + break; + case RISCVOp::OPERAND_SIMM12: + Ok = isInt<12>(Imm); + break; + case RISCVOp::OPERAND_SIMM13_LSB0: + Ok = isShiftedInt<12, 1>(Imm); + break; + case RISCVOp::OPERAND_UIMM20: + Ok = isUInt<20>(Imm); + break; + case RISCVOp::OPERAND_SIMM21_LSB0: + Ok = isShiftedInt<20, 1>(Imm); + break; + case RISCVOp::OPERAND_UIMMLOG2XLEN: + if (STI.getTargetTriple().isArch64Bit()) + Ok = isUInt<6>(Imm); + else + Ok = isUInt<5>(Imm); + break; + } + if (!Ok) { + ErrInfo = "Invalid immediate"; + return false; + } + } + } + } + + return true; +} diff --git a/lib/Target/RISCV/RISCVInstrInfo.h b/lib/Target/RISCV/RISCVInstrInfo.h index ff098e660d19..d3ae04aefe04 100644 --- a/lib/Target/RISCV/RISCVInstrInfo.h +++ b/lib/Target/RISCV/RISCVInstrInfo.h @@ -21,10 +21,12 @@ namespace llvm { +class RISCVSubtarget; + class RISCVInstrInfo : public RISCVGenInstrInfo { public: - RISCVInstrInfo(); + explicit RISCVInstrInfo(RISCVSubtarget &STI); unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; @@ -46,10 +48,10 @@ public: int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - // Materializes the given int32 Val into DstReg. - void movImm32(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, unsigned DstReg, uint64_t Val, - MachineInstr::MIFlag Flag = MachineInstr::NoFlags) const; + // Materializes the given integer Val into DstReg. 
+ void movImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register DstReg, uint64_t Val, + MachineInstr::MIFlag Flag = MachineInstr::NoFlags) const; unsigned getInstSizeInBytes(const MachineInstr &MI) const override; @@ -80,6 +82,12 @@ public: int64_t BrOffset) const override; bool isAsCheapAsAMove(const MachineInstr &MI) const override; + + bool verifyInstruction(const MachineInstr &MI, + StringRef &ErrInfo) const override; + +protected: + const RISCVSubtarget &STI; }; } #endif diff --git a/lib/Target/RISCV/RISCVInstrInfo.td b/lib/Target/RISCV/RISCVInstrInfo.td index 69bde15f1218..db2ecc49d14e 100644 --- a/lib/Target/RISCV/RISCVInstrInfo.td +++ b/lib/Target/RISCV/RISCVInstrInfo.td @@ -69,6 +69,12 @@ class ImmAsmOperand<string prefix, int width, string suffix> : AsmOperandClass { let DiagnosticType = !strconcat("Invalid", Name); } +def ImmZeroAsmOperand : AsmOperandClass { + let Name = "ImmZero"; + let RenderMethod = "addImmOperands"; + let DiagnosticType = !strconcat("Invalid", Name); +} + class SImmAsmOperand<int width, string suffix = ""> : ImmAsmOperand<"S", width, suffix> { } @@ -87,6 +93,8 @@ def fencearg : Operand<XLenVT> { let ParserMatchClass = FenceArg; let PrintMethod = "printFenceArg"; let DecoderMethod = "decodeUImmOperand<4>"; + let OperandType = "OPERAND_UIMM4"; + let OperandNamespace = "RISCVOp"; } def UImmLog2XLenAsmOperand : AsmOperandClass { @@ -111,11 +119,15 @@ def uimmlog2xlen : Operand<XLenVT>, ImmLeaf<XLenVT, [{ return isUInt<6>(Imm); return isUInt<5>(Imm); }]; + let OperandType = "OPERAND_UIMMLOG2XLEN"; + let OperandNamespace = "RISCVOp"; } def uimm5 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isUInt<5>(Imm);}]> { let ParserMatchClass = UImmAsmOperand<5>; let DecoderMethod = "decodeUImmOperand<5>"; + let OperandType = "OPERAND_UIMM5"; + let OperandNamespace = "RISCVOp"; } def simm12 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isInt<12>(Imm);}]> { @@ -128,6 +140,8 @@ def simm12 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isInt<12>(Imm);}]> { return isInt<12>(Imm); return MCOp.isBareSymbolRef(); }]; + let OperandType = "OPERAND_SIMM12"; + let OperandNamespace = "RISCVOp"; } // A 13-bit signed immediate where the least significant bit is zero. @@ -141,6 +155,8 @@ def simm13_lsb0 : Operand<OtherVT> { return isShiftedInt<12, 1>(Imm); return MCOp.isBareSymbolRef(); }]; + let OperandType = "OPERAND_SIMM13_LSB0"; + let OperandNamespace = "RISCVOp"; } class UImm20Operand : Operand<XLenVT> { @@ -152,6 +168,8 @@ class UImm20Operand : Operand<XLenVT> { return isUInt<20>(Imm); return MCOp.isBareSymbolRef(); }]; + let OperandType = "OPERAND_UIMM20"; + let OperandNamespace = "RISCVOp"; } def uimm20_lui : UImm20Operand { @@ -176,6 +194,8 @@ def simm21_lsb0_jal : Operand<OtherVT> { return isShiftedInt<20, 1>(Imm); return MCOp.isBareSymbolRef(); }]; + let OperandType = "OPERAND_SIMM21_LSB0"; + let OperandNamespace = "RISCVOp"; } def BareSymbol : AsmOperandClass { @@ -224,6 +244,8 @@ def csr_sysreg : Operand<XLenVT> { let ParserMatchClass = CSRSystemRegister; let PrintMethod = "printCSRSystemRegister"; let DecoderMethod = "decodeUImmOperand<12>"; + let OperandType = "OPERAND_UIMM12"; + let OperandNamespace = "RISCVOp"; } // A parameterized register class alternative to i32imm/i64imm from Target.td. 
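The OperandType/OperandNamespace annotations added to RISCVInstrInfo.td above are what the new RISCVInstrInfo::verifyInstruction hook keys on when it range-checks each immediate operand. A minimal self-contained sketch of those range predicates (re-implemented here purely for illustration; the backend itself uses isInt<N>, isUInt<N>, and isShiftedInt<N, 1> from llvm/Support/MathExtras.h):

```c++
// Self-contained versions of the range predicates verifyInstruction applies;
// illustrative only, mirroring isInt<N>/isUInt<N>/isShiftedInt<N, 1>.
#include <cassert>
#include <cstdint>

template <unsigned N> bool isSIntN(int64_t V) {
  return V >= -(int64_t(1) << (N - 1)) && V < (int64_t(1) << (N - 1));
}
template <unsigned N> bool isUIntN(int64_t V) {
  return V >= 0 && V < (int64_t(1) << N);
}
// An (N+1)-bit signed value whose least significant bit is zero, as used
// for branch (OPERAND_SIMM13_LSB0) and jump (OPERAND_SIMM21_LSB0) offsets.
template <unsigned N> bool isShiftedSIntN1(int64_t V) {
  return (V % 2 == 0) && isSIntN<N + 1>(V);
}

int main() {
  assert(isSIntN<12>(-2048) && !isSIntN<12>(2048));       // OPERAND_SIMM12
  assert(isUIntN<20>(0xfffff) && !isUIntN<20>(0x100000)); // OPERAND_UIMM20
  assert(isShiftedSIntN1<12>(-4096) && !isShiftedSIntN1<12>(3)); // SIMM13_LSB0
  return 0;
}
```

For OPERAND_UIMMLOG2XLEN the bound depends on the subtarget, which is why verifyInstruction consults STI.getTargetTriple().isArch64Bit() before choosing the 5-bit or 6-bit check.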
diff --git a/lib/Target/RISCV/RISCVInstrInfoA.td b/lib/Target/RISCV/RISCVInstrInfoA.td index b768c9347b38..38ba3f9fb24e 100644 --- a/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/lib/Target/RISCV/RISCVInstrInfoA.td @@ -12,14 +12,32 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// +// Operand and SDNode transformation definitions. +//===----------------------------------------------------------------------===// + +// A parse method for (${gpr}) or 0(${gpr}), where the 0 is silently ignored. +// Used for compatibility with GNU as. +def AtomicMemOpOperand : AsmOperandClass { + let Name = "AtomicMemOpOperand"; + let RenderMethod = "addRegOperands"; + let PredicateMethod = "isReg"; + let ParserMethod = "parseAtomicMemOp"; +} + +def GPRMemAtomic : RegisterOperand<GPR> { + let ParserMatchClass = AtomicMemOpOperand; + let PrintMethod = "printAtomicMemOp"; +} + +//===----------------------------------------------------------------------===// // Instruction class templates //===----------------------------------------------------------------------===// let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in class LR_r<bit aq, bit rl, bits<3> funct3, string opcodestr> : RVInstRAtomic<0b00010, aq, rl, funct3, OPC_AMO, - (outs GPR:$rd), (ins GPR:$rs1), - opcodestr, "$rd, (${rs1})"> { + (outs GPR:$rd), (ins GPRMemAtomic:$rs1), + opcodestr, "$rd, $rs1"> { let rs2 = 0; } @@ -33,8 +51,8 @@ multiclass LR_r_aq_rl<bits<3> funct3, string opcodestr> { let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in class AMO_rr<bits<5> funct5, bit aq, bit rl, bits<3> funct3, string opcodestr> : RVInstRAtomic<funct5, aq, rl, funct3, OPC_AMO, - (outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2), - opcodestr, "$rd, $rs2, (${rs1})">; + (outs GPR:$rd), (ins GPRMemAtomic:$rs1, GPR:$rs2), + opcodestr, "$rd, $rs2, $rs1">; multiclass AMO_rr_aq_rl<bits<5> funct5, bits<3> funct3, string opcodestr> { def "" : AMO_rr<funct5, 0, 0, funct3, opcodestr>; @@ -196,12 +214,12 @@ class PseudoMaskedAMOUMinUMax } class PseudoMaskedAMOPat<Intrinsic intrin, Pseudo AMOInst> - : Pat<(intrin GPR:$addr, GPR:$incr, GPR:$mask, imm:$ordering), + : Pat<(intrin GPR:$addr, GPR:$incr, GPR:$mask, timm:$ordering), (AMOInst GPR:$addr, GPR:$incr, GPR:$mask, imm:$ordering)>; class PseudoMaskedAMOMinMaxPat<Intrinsic intrin, Pseudo AMOInst> : Pat<(intrin GPR:$addr, GPR:$incr, GPR:$mask, GPR:$shiftamt, - imm:$ordering), + timm:$ordering), (AMOInst GPR:$addr, GPR:$incr, GPR:$mask, GPR:$shiftamt, imm:$ordering)>; @@ -270,7 +288,7 @@ def PseudoMaskedCmpXchg32 } def : Pat<(int_riscv_masked_cmpxchg_i32 - GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering), + GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering), (PseudoMaskedCmpXchg32 GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering)>; @@ -347,7 +365,7 @@ def PseudoCmpXchg64 : PseudoCmpXchg; defm : PseudoCmpXchgPat<"atomic_cmp_swap_64", PseudoCmpXchg64>; def : Pat<(int_riscv_masked_cmpxchg_i64 - GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering), + GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering), (PseudoMaskedCmpXchg32 GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering)>; } // Predicates = [HasStdExtA, IsRV64] diff --git a/lib/Target/RISCV/RISCVInstrInfoC.td b/lib/Target/RISCV/RISCVInstrInfoC.td index 94477341eea7..fa0050f107b2 100644 --- a/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/lib/Target/RISCV/RISCVInstrInfoC.td @@ -61,6 +61,11 @@ def simm6nonzero :
Operand<XLenVT>, }]; } +def immzero : Operand<XLenVT>, + ImmLeaf<XLenVT, [{return (Imm == 0);}]> { + let ParserMatchClass = ImmZeroAsmOperand; +} + def CLUIImmAsmOperand : AsmOperandClass { let Name = "CLUIImm"; let RenderMethod = "addImmOperands"; @@ -132,7 +137,8 @@ def uimm8_lsb000 : Operand<XLenVT>, } // A 9-bit signed immediate where the least significant bit is zero. -def simm9_lsb0 : Operand<OtherVT> { +def simm9_lsb0 : Operand<OtherVT>, + ImmLeaf<XLenVT, [{return isShiftedInt<8, 1>(Imm);}]> { let ParserMatchClass = SImmAsmOperand<9, "Lsb0">; let EncoderMethod = "getImmOpValueAsr1"; let DecoderMethod = "decodeSImmOperandAndLsl1<9>"; @@ -191,7 +197,8 @@ def simm10_lsb0000nonzero : Operand<XLenVT>, } // A 12-bit signed immediate where the least significant bit is zero. -def simm12_lsb0 : Operand<XLenVT> { +def simm12_lsb0 : Operand<XLenVT>, + ImmLeaf<XLenVT, [{return isShiftedInt<11, 1>(Imm);}]> { let ParserMatchClass = SImmAsmOperand<12, "Lsb0">; let EncoderMethod = "getImmOpValueAsr1"; let DecoderMethod = "decodeSImmOperandAndLsl1<12>"; @@ -344,7 +351,10 @@ def C_SD : CStore_rri<0b111, "c.sd", GPRC, uimm8_lsb000> { } let rd = 0, imm = 0, hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -def C_NOP : RVInst16CI<0b000, 0b01, (outs), (ins), "c.nop", "">; +def C_NOP : RVInst16CI<0b000, 0b01, (outs), (ins), "c.nop", ""> +{ + let Inst{6-2} = 0; +} let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb), @@ -354,6 +364,15 @@ def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb), let Inst{6-2} = imm{4-0}; } +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +def C_ADDI_NOP : RVInst16CI<0b000, 0b01, (outs GPRX0:$rd_wb), + (ins GPRX0:$rd, immzero:$imm), + "c.addi", "$rd, $imm"> { + let Constraints = "$rd = $rd_wb"; + let Inst{6-2} = 0; + let isAsmParserOnly = 1; +} + let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCall = 1, DecoderNamespace = "RISCV32Only_", Defs = [X1], Predicates = [HasStdExtC, IsRV32] in @@ -523,6 +542,105 @@ def C_UNIMP : RVInst16<(outs), (ins), "c.unimp", "", [], InstFormatOther> { } // Predicates = [HasStdExtC] //===----------------------------------------------------------------------===// +// HINT Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [HasStdExtC, HasRVCHints], hasSideEffects = 0, mayLoad = 0, + mayStore = 0 in +{ + +let rd = 0 in +def C_NOP_HINT : RVInst16CI<0b000, 0b01, (outs), (ins simm6nonzero:$imm), + "c.nop", "$imm"> { + let Inst{6-2} = imm{4-0}; + let DecoderMethod = "decodeRVCInstrSImm"; +} + +// Just a different syntax for the c.nop hint: c.addi x0, simm6 vs c.nop simm6. 
+def C_ADDI_HINT_X0 : RVInst16CI<0b000, 0b01, (outs GPRX0:$rd_wb), + (ins GPRX0:$rd, simm6nonzero:$imm), + "c.addi", "$rd, $imm"> { + let Constraints = "$rd = $rd_wb"; + let Inst{6-2} = imm{4-0}; + let isAsmParserOnly = 1; +} + +def C_ADDI_HINT_IMM_ZERO : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb), + (ins GPRNoX0:$rd, immzero:$imm), + "c.addi", "$rd, $imm"> { + let Constraints = "$rd = $rd_wb"; + let Inst{6-2} = 0; + let isAsmParserOnly = 1; +} + +def C_LI_HINT : RVInst16CI<0b010, 0b01, (outs GPRX0:$rd), (ins simm6:$imm), + "c.li", "$rd, $imm"> { + let Inst{6-2} = imm{4-0}; + let Inst{11-7} = 0; + let DecoderMethod = "decodeRVCInstrRdSImm"; +} + +def C_LUI_HINT : RVInst16CI<0b011, 0b01, (outs GPRX0:$rd), + (ins c_lui_imm:$imm), + "c.lui", "$rd, $imm"> { + let Inst{6-2} = imm{4-0}; + let Inst{11-7} = 0; + let DecoderMethod = "decodeRVCInstrRdSImm"; +} + +def C_MV_HINT : RVInst16CR<0b1000, 0b10, (outs GPRX0:$rs1), (ins GPRNoX0:$rs2), + "c.mv", "$rs1, $rs2"> +{ + let Inst{11-7} = 0; + let DecoderMethod = "decodeRVCInstrRdRs2"; +} + +def C_ADD_HINT : RVInst16CR<0b1001, 0b10, (outs GPRX0:$rs1_wb), + (ins GPRX0:$rs1, GPRNoX0:$rs2), + "c.add", "$rs1, $rs2"> { + let Constraints = "$rs1 = $rs1_wb"; + let Inst{11-7} = 0; + let DecoderMethod = "decodeRVCInstrRdRs1Rs2"; +} + +def C_SLLI_HINT : RVInst16CI<0b000, 0b10, (outs GPRX0:$rd_wb), + (ins GPRX0:$rd, uimmlog2xlennonzero:$imm), + "c.slli" ,"$rd, $imm"> { + let Constraints = "$rd = $rd_wb"; + let Inst{6-2} = imm{4-0}; + let Inst{11-7} = 0; + let DecoderMethod = "decodeRVCInstrRdRs1UImm"; +} + +def C_SLLI64_HINT : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb), (ins GPR:$rd), + "c.slli64" ,"$rd"> { + let Constraints = "$rd = $rd_wb"; + let Inst{6-2} = 0; + let Inst{12} = 0; +} + +def C_SRLI64_HINT : RVInst16CI<0b100, 0b01, (outs GPRC:$rd_wb), + (ins GPRC:$rd), + "c.srli64", "$rd"> { + let Constraints = "$rd = $rd_wb"; + let Inst{6-2} = 0; + let Inst{11-10} = 0; + let Inst{12} = 0; +} + +def C_SRAI64_HINT : RVInst16CI<0b100, 0b01, (outs GPRC:$rd_wb), + (ins GPRC:$rd), + "c.srai64", "$rd"> { + let Constraints = "$rd = $rd_wb"; + let Inst{6-2} = 0; + let Inst{11-10} = 1; + let Inst{12} = 0; +} + +} // Predicates = [HasStdExtC, HasRVCHints], hasSideEffects = 0, mayLoad = 0, + // mayStore = 0 + +//===----------------------------------------------------------------------===// // Assembler Pseudo Instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/RISCV/RISCVInstrInfoF.td b/lib/Target/RISCV/RISCVInstrInfoF.td index 032642942f2b..3b73c865ea17 100644 --- a/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/lib/Target/RISCV/RISCVInstrInfoF.td @@ -227,6 +227,12 @@ def : InstAlias<"frcsr $rd", (CSRRS GPR:$rd, FCSR.Encoding, X0), 2>; def : InstAlias<"fscsr $rd, $rs", (CSRRW GPR:$rd, FCSR.Encoding, GPR:$rs)>; def : InstAlias<"fscsr $rs", (CSRRW X0, FCSR.Encoding, GPR:$rs), 2>; +// frsr, fssr are obsolete aliases replaced by frcsr, fscsr, so give them +// zero weight. 
+def : InstAlias<"frsr $rd", (CSRRS GPR:$rd, FCSR.Encoding, X0), 0>; +def : InstAlias<"fssr $rd, $rs", (CSRRW GPR:$rd, FCSR.Encoding, GPR:$rs), 0>; +def : InstAlias<"fssr $rs", (CSRRW X0, FCSR.Encoding, GPR:$rs), 0>; + def : InstAlias<"frrm $rd", (CSRRS GPR:$rd, FRM.Encoding, X0), 2>; def : InstAlias<"fsrm $rd, $rs", (CSRRW GPR:$rd, FRM.Encoding, GPR:$rs)>; def : InstAlias<"fsrm $rs", (CSRRW X0, FRM.Encoding, GPR:$rs), 2>; diff --git a/lib/Target/RISCV/RISCVInstructionSelector.cpp b/lib/Target/RISCV/RISCVInstructionSelector.cpp new file mode 100644 index 000000000000..5bd09a546114 --- /dev/null +++ b/lib/Target/RISCV/RISCVInstructionSelector.cpp @@ -0,0 +1,103 @@ +//===-- RISCVInstructionSelector.cpp -----------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the InstructionSelector class for +/// RISCV. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#include "RISCVRegisterBankInfo.h" +#include "RISCVSubtarget.h" +#include "RISCVTargetMachine.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "riscv-isel" + +using namespace llvm; + +#define GET_GLOBALISEL_PREDICATE_BITSET +#include "RISCVGenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATE_BITSET + +namespace { + +class RISCVInstructionSelector : public InstructionSelector { +public: + RISCVInstructionSelector(const RISCVTargetMachine &TM, + const RISCVSubtarget &STI, + const RISCVRegisterBankInfo &RBI); + + bool select(MachineInstr &I) override; + static const char *getName() { return DEBUG_TYPE; } + +private: + bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; + + const RISCVSubtarget &STI; + const RISCVInstrInfo &TII; + const RISCVRegisterInfo &TRI; + const RISCVRegisterBankInfo &RBI; + + // FIXME: This is necessary because DAGISel uses "Subtarget->" and GlobalISel + // uses "STI." in the code generated by TableGen. We need to unify the name of + // Subtarget variable. + const RISCVSubtarget *Subtarget = &STI; + +#define GET_GLOBALISEL_PREDICATES_DECL +#include "RISCVGenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATES_DECL + +#define GET_GLOBALISEL_TEMPORARIES_DECL +#include "RISCVGenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_DECL +}; + +} // end anonymous namespace + +#define GET_GLOBALISEL_IMPL +#include "RISCVGenGlobalISel.inc" +#undef GET_GLOBALISEL_IMPL + +RISCVInstructionSelector::RISCVInstructionSelector( + const RISCVTargetMachine &TM, const RISCVSubtarget &STI, + const RISCVRegisterBankInfo &RBI) + : InstructionSelector(), STI(STI), TII(*STI.getInstrInfo()), + TRI(*STI.getRegisterInfo()), RBI(RBI), + +#define GET_GLOBALISEL_PREDICATES_INIT +#include "RISCVGenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATES_INIT +#define GET_GLOBALISEL_TEMPORARIES_INIT +#include "RISCVGenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_INIT +{ +} + +bool RISCVInstructionSelector::select(MachineInstr &I) { + + if (!isPreISelGenericOpcode(I.getOpcode())) { + // Certain non-generic instructions also need some special handling. 
+ return true;
+ }
+
+ if (selectImpl(I, *CoverageInfo))
+ return true;
+
+ return false;
+}
+
+namespace llvm {
+InstructionSelector *
+createRISCVInstructionSelector(const RISCVTargetMachine &TM,
+ RISCVSubtarget &Subtarget,
+ RISCVRegisterBankInfo &RBI) {
+ return new RISCVInstructionSelector(TM, Subtarget, RBI);
+}
+} // end namespace llvm
diff --git a/lib/Target/RISCV/RISCVLegalizerInfo.cpp b/lib/Target/RISCV/RISCVLegalizerInfo.cpp
new file mode 100644
index 000000000000..c92f4a3ee17b
--- /dev/null
+++ b/lib/Target/RISCV/RISCVLegalizerInfo.cpp
@@ -0,0 +1,23 @@
+//===-- RISCVLegalizerInfo.cpp ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the MachineLegalizer class for RISCV.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "RISCVLegalizerInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Type.h"
+
+using namespace llvm;
+
+RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) {
+ computeTables();
+}
diff --git a/lib/Target/RISCV/RISCVLegalizerInfo.h b/lib/Target/RISCV/RISCVLegalizerInfo.h
new file mode 100644
index 000000000000..f2c2b9a3fd46
--- /dev/null
+++ b/lib/Target/RISCV/RISCVLegalizerInfo.h
@@ -0,0 +1,28 @@
+//===-- RISCVLegalizerInfo.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the MachineLegalizer class for RISCV.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_RISCVMACHINELEGALIZER_H
+#define LLVM_LIB_TARGET_RISCV_RISCVMACHINELEGALIZER_H
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+namespace llvm {
+
+class RISCVSubtarget;
+
+/// This class provides legalization rules for the RISCV target.
+class RISCVLegalizerInfo : public LegalizerInfo { +public: + RISCVLegalizerInfo(const RISCVSubtarget &ST); +}; +} // end namespace llvm +#endif diff --git a/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/lib/Target/RISCV/RISCVMergeBaseOffset.cpp index 82b1209cb8e7..4c9013aa1e23 100644 --- a/lib/Target/RISCV/RISCVMergeBaseOffset.cpp +++ b/lib/Target/RISCV/RISCVMergeBaseOffset.cpp @@ -45,7 +45,7 @@ struct RISCVMergeBaseOffsetOpt : public MachineFunctionPass { bool detectAndFoldOffset(MachineInstr &HiLUI, MachineInstr &LoADDI); void foldOffset(MachineInstr &HiLUI, MachineInstr &LoADDI, MachineInstr &Tail, int64_t Offset); - bool matchLargeOffset(MachineInstr &TailAdd, unsigned GSReg, int64_t &Offset); + bool matchLargeOffset(MachineInstr &TailAdd, Register GSReg, int64_t &Offset); RISCVMergeBaseOffsetOpt() : MachineFunctionPass(ID) {} MachineFunctionProperties getRequiredProperties() const override { @@ -85,7 +85,7 @@ bool RISCVMergeBaseOffsetOpt::detectLuiAddiGlobal(MachineInstr &HiLUI, HiLUI.getOperand(1).getOffset() != 0 || !MRI->hasOneUse(HiLUI.getOperand(0).getReg())) return false; - unsigned HiLuiDestReg = HiLUI.getOperand(0).getReg(); + Register HiLuiDestReg = HiLUI.getOperand(0).getReg(); LoADDI = MRI->use_begin(HiLuiDestReg)->getParent(); if (LoADDI->getOpcode() != RISCV::ADDI || LoADDI->getOperand(2).getTargetFlags() != RISCVII::MO_LO || @@ -132,12 +132,12 @@ void RISCVMergeBaseOffsetOpt::foldOffset(MachineInstr &HiLUI, // \ / // TailAdd: add vreg4, vreg2, voff bool RISCVMergeBaseOffsetOpt::matchLargeOffset(MachineInstr &TailAdd, - unsigned GAReg, + Register GAReg, int64_t &Offset) { assert((TailAdd.getOpcode() == RISCV::ADD) && "Expected ADD instruction!"); - unsigned Rs = TailAdd.getOperand(1).getReg(); - unsigned Rt = TailAdd.getOperand(2).getReg(); - unsigned Reg = Rs == GAReg ? Rt : Rs; + Register Rs = TailAdd.getOperand(1).getReg(); + Register Rt = TailAdd.getOperand(2).getReg(); + Register Reg = Rs == GAReg ? Rt : Rs; // Can't fold if the register has more than one use. if (!MRI->hasOneUse(Reg)) @@ -178,7 +178,7 @@ bool RISCVMergeBaseOffsetOpt::matchLargeOffset(MachineInstr &TailAdd, bool RISCVMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &HiLUI, MachineInstr &LoADDI) { - unsigned DestReg = LoADDI.getOperand(0).getReg(); + Register DestReg = LoADDI.getOperand(0).getReg(); assert(MRI->hasOneUse(DestReg) && "expected one use for LoADDI"); // LoADDI has only one use. MachineInstr &Tail = *MRI->use_begin(DestReg)->getParent(); @@ -232,7 +232,7 @@ bool RISCVMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &HiLUI, return false; // Register defined by LoADDI should be used in the base part of the // load\store instruction. Otherwise, no folding possible. - unsigned BaseAddrReg = Tail.getOperand(1).getReg(); + Register BaseAddrReg = Tail.getOperand(1).getReg(); if (DestReg != BaseAddrReg) return false; MachineOperand &TailImmOp = Tail.getOperand(2); diff --git a/lib/Target/RISCV/RISCVRegisterBankInfo.cpp b/lib/Target/RISCV/RISCVRegisterBankInfo.cpp new file mode 100644 index 000000000000..bd3b95a98b9f --- /dev/null +++ b/lib/Target/RISCV/RISCVRegisterBankInfo.cpp @@ -0,0 +1,26 @@ +//===-- RISCVRegisterBankInfo.cpp -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the RegisterBankInfo class for RISCV. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#include "RISCVRegisterBankInfo.h" +#include "MCTargetDesc/RISCVMCTargetDesc.h" +#include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" + +#define GET_TARGET_REGBANK_IMPL +#include "RISCVGenRegisterBank.inc" + +using namespace llvm; + +RISCVRegisterBankInfo::RISCVRegisterBankInfo(const TargetRegisterInfo &TRI) + : RISCVGenRegisterBankInfo() {} diff --git a/lib/Target/RISCV/RISCVRegisterBankInfo.h b/lib/Target/RISCV/RISCVRegisterBankInfo.h new file mode 100644 index 000000000000..05fac992734d --- /dev/null +++ b/lib/Target/RISCV/RISCVRegisterBankInfo.h @@ -0,0 +1,37 @@ +//===-- RISCVRegisterBankInfo.h ---------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the RegisterBankInfo class for RISCV. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_RISCV_RISCVREGISTERBANKINFO_H +#define LLVM_LIB_TARGET_RISCV_RISCVREGISTERBANKINFO_H + +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" + +#define GET_REGBANK_DECLARATIONS +#include "RISCVGenRegisterBank.inc" + +namespace llvm { + +class TargetRegisterInfo; + +class RISCVGenRegisterBankInfo : public RegisterBankInfo { +protected: +#define GET_TARGET_REGBANK_CLASS +#include "RISCVGenRegisterBank.inc" +}; + +/// This class provides the information for the target register banks. +class RISCVRegisterBankInfo final : public RISCVGenRegisterBankInfo { +public: + RISCVRegisterBankInfo(const TargetRegisterInfo &TRI); +}; +} // end namespace llvm +#endif diff --git a/lib/Target/RISCV/RISCVRegisterBanks.td b/lib/Target/RISCV/RISCVRegisterBanks.td new file mode 100644 index 000000000000..400b65a1bf9a --- /dev/null +++ b/lib/Target/RISCV/RISCVRegisterBanks.td @@ -0,0 +1,13 @@ +//=-- RISCVRegisterBank.td - Describe the RISCV Banks --------*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +/// General Purpose Registers: X. 
+def GPRRegBank : RegisterBank<"GPRB", [GPR]>; diff --git a/lib/Target/RISCV/RISCVRegisterInfo.cpp b/lib/Target/RISCV/RISCVRegisterInfo.cpp index e6a126e3e513..66557687c0b6 100644 --- a/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -26,6 +26,15 @@ using namespace llvm; +static_assert(RISCV::X1 == RISCV::X0 + 1, "Register list not consecutive"); +static_assert(RISCV::X31 == RISCV::X0 + 31, "Register list not consecutive"); +static_assert(RISCV::F1_F == RISCV::F0_F + 1, "Register list not consecutive"); +static_assert(RISCV::F31_F == RISCV::F0_F + 31, + "Register list not consecutive"); +static_assert(RISCV::F1_D == RISCV::F0_D + 1, "Register list not consecutive"); +static_assert(RISCV::F31_D == RISCV::F0_D + 31, + "Register list not consecutive"); + RISCVRegisterInfo::RISCVRegisterInfo(unsigned HwMode) : RISCVGenRegisterInfo(RISCV::X1, /*DwarfFlavour*/0, /*EHFlavor*/0, /*PC*/0, HwMode) {} @@ -109,8 +118,8 @@ void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, assert(isInt<32>(Offset) && "Int32 expected"); // The offset won't fit in an immediate, so use a scratch register instead // Modify Offset and FrameReg appropriately - unsigned ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass); - TII->movImm32(MBB, II, DL, ScratchReg, Offset); + Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass); + TII->movImm(MBB, II, DL, ScratchReg, Offset); BuildMI(MBB, II, DL, TII->get(RISCV::ADD), ScratchReg) .addReg(FrameReg) .addReg(ScratchReg, RegState::Kill); diff --git a/lib/Target/RISCV/RISCVRegisterInfo.h b/lib/Target/RISCV/RISCVRegisterInfo.h index 4f339475508f..56a50fe6ddc0 100644 --- a/lib/Target/RISCV/RISCVRegisterInfo.h +++ b/lib/Target/RISCV/RISCVRegisterInfo.h @@ -52,6 +52,12 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo { bool trackLivenessAfterRegAlloc(const MachineFunction &) const override { return true; } + + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, + unsigned Kind = 0) const override { + return &RISCV::GPRRegClass; + } }; } diff --git a/lib/Target/RISCV/RISCVRegisterInfo.td b/lib/Target/RISCV/RISCVRegisterInfo.td index 79f8ab12f6c0..82b37afd0805 100644 --- a/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/lib/Target/RISCV/RISCVRegisterInfo.td @@ -101,6 +101,12 @@ def GPR : RegisterClass<"RISCV", [XLenVT], 32, (add [RegInfo<32,32,32>, RegInfo<64,64,64>, RegInfo<32,32,32>]>; } +def GPRX0 : RegisterClass<"RISCV", [XLenVT], 32, (add X0)> { + let RegInfos = RegInfoByHwMode< + [RV32, RV64, DefaultMode], + [RegInfo<32,32,32>, RegInfo<64,64,64>, RegInfo<32,32,32>]>; +} + // The order of registers represents the preferred allocation sequence. // Registers are listed in the order caller-save, callee-save, specials. 
def GPRNoX0 : RegisterClass<"RISCV", [XLenVT], 32, (add @@ -159,41 +165,41 @@ def SP : RegisterClass<"RISCV", [XLenVT], 32, (add X2)> { // Floating point registers let RegAltNameIndices = [ABIRegAltName] in { - def F0_32 : RISCVReg32<0, "f0", ["ft0"]>, DwarfRegNum<[32]>; - def F1_32 : RISCVReg32<1, "f1", ["ft1"]>, DwarfRegNum<[33]>; - def F2_32 : RISCVReg32<2, "f2", ["ft2"]>, DwarfRegNum<[34]>; - def F3_32 : RISCVReg32<3, "f3", ["ft3"]>, DwarfRegNum<[35]>; - def F4_32 : RISCVReg32<4, "f4", ["ft4"]>, DwarfRegNum<[36]>; - def F5_32 : RISCVReg32<5, "f5", ["ft5"]>, DwarfRegNum<[37]>; - def F6_32 : RISCVReg32<6, "f6", ["ft6"]>, DwarfRegNum<[38]>; - def F7_32 : RISCVReg32<7, "f7", ["ft7"]>, DwarfRegNum<[39]>; - def F8_32 : RISCVReg32<8, "f8", ["fs0"]>, DwarfRegNum<[40]>; - def F9_32 : RISCVReg32<9, "f9", ["fs1"]>, DwarfRegNum<[41]>; - def F10_32 : RISCVReg32<10,"f10", ["fa0"]>, DwarfRegNum<[42]>; - def F11_32 : RISCVReg32<11,"f11", ["fa1"]>, DwarfRegNum<[43]>; - def F12_32 : RISCVReg32<12,"f12", ["fa2"]>, DwarfRegNum<[44]>; - def F13_32 : RISCVReg32<13,"f13", ["fa3"]>, DwarfRegNum<[45]>; - def F14_32 : RISCVReg32<14,"f14", ["fa4"]>, DwarfRegNum<[46]>; - def F15_32 : RISCVReg32<15,"f15", ["fa5"]>, DwarfRegNum<[47]>; - def F16_32 : RISCVReg32<16,"f16", ["fa6"]>, DwarfRegNum<[48]>; - def F17_32 : RISCVReg32<17,"f17", ["fa7"]>, DwarfRegNum<[49]>; - def F18_32 : RISCVReg32<18,"f18", ["fs2"]>, DwarfRegNum<[50]>; - def F19_32 : RISCVReg32<19,"f19", ["fs3"]>, DwarfRegNum<[51]>; - def F20_32 : RISCVReg32<20,"f20", ["fs4"]>, DwarfRegNum<[52]>; - def F21_32 : RISCVReg32<21,"f21", ["fs5"]>, DwarfRegNum<[53]>; - def F22_32 : RISCVReg32<22,"f22", ["fs6"]>, DwarfRegNum<[54]>; - def F23_32 : RISCVReg32<23,"f23", ["fs7"]>, DwarfRegNum<[55]>; - def F24_32 : RISCVReg32<24,"f24", ["fs8"]>, DwarfRegNum<[56]>; - def F25_32 : RISCVReg32<25,"f25", ["fs9"]>, DwarfRegNum<[57]>; - def F26_32 : RISCVReg32<26,"f26", ["fs10"]>, DwarfRegNum<[58]>; - def F27_32 : RISCVReg32<27,"f27", ["fs11"]>, DwarfRegNum<[59]>; - def F28_32 : RISCVReg32<28,"f28", ["ft8"]>, DwarfRegNum<[60]>; - def F29_32 : RISCVReg32<29,"f29", ["ft9"]>, DwarfRegNum<[61]>; - def F30_32 : RISCVReg32<30,"f30", ["ft10"]>, DwarfRegNum<[62]>; - def F31_32 : RISCVReg32<31,"f31", ["ft11"]>, DwarfRegNum<[63]>; + def F0_F : RISCVReg32<0, "f0", ["ft0"]>, DwarfRegNum<[32]>; + def F1_F : RISCVReg32<1, "f1", ["ft1"]>, DwarfRegNum<[33]>; + def F2_F : RISCVReg32<2, "f2", ["ft2"]>, DwarfRegNum<[34]>; + def F3_F : RISCVReg32<3, "f3", ["ft3"]>, DwarfRegNum<[35]>; + def F4_F : RISCVReg32<4, "f4", ["ft4"]>, DwarfRegNum<[36]>; + def F5_F : RISCVReg32<5, "f5", ["ft5"]>, DwarfRegNum<[37]>; + def F6_F : RISCVReg32<6, "f6", ["ft6"]>, DwarfRegNum<[38]>; + def F7_F : RISCVReg32<7, "f7", ["ft7"]>, DwarfRegNum<[39]>; + def F8_F : RISCVReg32<8, "f8", ["fs0"]>, DwarfRegNum<[40]>; + def F9_F : RISCVReg32<9, "f9", ["fs1"]>, DwarfRegNum<[41]>; + def F10_F : RISCVReg32<10,"f10", ["fa0"]>, DwarfRegNum<[42]>; + def F11_F : RISCVReg32<11,"f11", ["fa1"]>, DwarfRegNum<[43]>; + def F12_F : RISCVReg32<12,"f12", ["fa2"]>, DwarfRegNum<[44]>; + def F13_F : RISCVReg32<13,"f13", ["fa3"]>, DwarfRegNum<[45]>; + def F14_F : RISCVReg32<14,"f14", ["fa4"]>, DwarfRegNum<[46]>; + def F15_F : RISCVReg32<15,"f15", ["fa5"]>, DwarfRegNum<[47]>; + def F16_F : RISCVReg32<16,"f16", ["fa6"]>, DwarfRegNum<[48]>; + def F17_F : RISCVReg32<17,"f17", ["fa7"]>, DwarfRegNum<[49]>; + def F18_F : RISCVReg32<18,"f18", ["fs2"]>, DwarfRegNum<[50]>; + def F19_F : RISCVReg32<19,"f19", ["fs3"]>, DwarfRegNum<[51]>; + def F20_F : 
RISCVReg32<20,"f20", ["fs4"]>, DwarfRegNum<[52]>; + def F21_F : RISCVReg32<21,"f21", ["fs5"]>, DwarfRegNum<[53]>; + def F22_F : RISCVReg32<22,"f22", ["fs6"]>, DwarfRegNum<[54]>; + def F23_F : RISCVReg32<23,"f23", ["fs7"]>, DwarfRegNum<[55]>; + def F24_F : RISCVReg32<24,"f24", ["fs8"]>, DwarfRegNum<[56]>; + def F25_F : RISCVReg32<25,"f25", ["fs9"]>, DwarfRegNum<[57]>; + def F26_F : RISCVReg32<26,"f26", ["fs10"]>, DwarfRegNum<[58]>; + def F27_F : RISCVReg32<27,"f27", ["fs11"]>, DwarfRegNum<[59]>; + def F28_F : RISCVReg32<28,"f28", ["ft8"]>, DwarfRegNum<[60]>; + def F29_F : RISCVReg32<29,"f29", ["ft9"]>, DwarfRegNum<[61]>; + def F30_F : RISCVReg32<30,"f30", ["ft10"]>, DwarfRegNum<[62]>; + def F31_F : RISCVReg32<31,"f31", ["ft11"]>, DwarfRegNum<[63]>; foreach Index = 0-31 in { - def F#Index#_64 : RISCVReg64<!cast<RISCVReg32>("F"#Index#"_32")>, + def F#Index#_D : RISCVReg64<!cast<RISCVReg32>("F"#Index#"_F")>, DwarfRegNum<[!add(Index, 32)]>; } } @@ -201,29 +207,29 @@ let RegAltNameIndices = [ABIRegAltName] in { // The order of registers represents the preferred allocation sequence, // meaning caller-save regs are listed before callee-save. def FPR32 : RegisterClass<"RISCV", [f32], 32, (add - (sequence "F%u_32", 0, 7), - (sequence "F%u_32", 10, 17), - (sequence "F%u_32", 28, 31), - (sequence "F%u_32", 8, 9), - (sequence "F%u_32", 18, 27) + (sequence "F%u_F", 0, 7), + (sequence "F%u_F", 10, 17), + (sequence "F%u_F", 28, 31), + (sequence "F%u_F", 8, 9), + (sequence "F%u_F", 18, 27) )>; def FPR32C : RegisterClass<"RISCV", [f32], 32, (add - (sequence "F%u_32", 10, 15), - (sequence "F%u_32", 8, 9) + (sequence "F%u_F", 10, 15), + (sequence "F%u_F", 8, 9) )>; // The order of registers represents the preferred allocation sequence, // meaning caller-save regs are listed before callee-save. 
def FPR64 : RegisterClass<"RISCV", [f64], 64, (add - (sequence "F%u_64", 0, 7), - (sequence "F%u_64", 10, 17), - (sequence "F%u_64", 28, 31), - (sequence "F%u_64", 8, 9), - (sequence "F%u_64", 18, 27) + (sequence "F%u_D", 0, 7), + (sequence "F%u_D", 10, 17), + (sequence "F%u_D", 28, 31), + (sequence "F%u_D", 8, 9), + (sequence "F%u_D", 18, 27) )>; def FPR64C : RegisterClass<"RISCV", [f64], 64, (add - (sequence "F%u_64", 10, 15), - (sequence "F%u_64", 8, 9) + (sequence "F%u_D", 10, 15), + (sequence "F%u_D", 8, 9) )>; diff --git a/lib/Target/RISCV/RISCVSubtarget.cpp b/lib/Target/RISCV/RISCVSubtarget.cpp index 6902ed75d852..f114c6ac1925 100644 --- a/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/lib/Target/RISCV/RISCVSubtarget.cpp @@ -12,7 +12,11 @@ #include "RISCVSubtarget.h" #include "RISCV.h" +#include "RISCVCallLowering.h" #include "RISCVFrameLowering.h" +#include "RISCVLegalizerInfo.h" +#include "RISCVRegisterBankInfo.h" +#include "RISCVTargetMachine.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; @@ -47,4 +51,28 @@ RISCVSubtarget::RISCVSubtarget(const Triple &TT, StringRef CPU, StringRef FS, StringRef ABIName, const TargetMachine &TM) : RISCVGenSubtargetInfo(TT, CPU, FS), FrameLowering(initializeSubtargetDependencies(TT, CPU, FS, ABIName)), - InstrInfo(), RegInfo(getHwMode()), TLInfo(TM, *this) {} + InstrInfo(*this), RegInfo(getHwMode()), TLInfo(TM, *this) { + CallLoweringInfo.reset(new RISCVCallLowering(*getTargetLowering())); + Legalizer.reset(new RISCVLegalizerInfo(*this)); + + auto *RBI = new RISCVRegisterBankInfo(*getRegisterInfo()); + RegBankInfo.reset(RBI); + InstSelector.reset(createRISCVInstructionSelector( + *static_cast<const RISCVTargetMachine *>(&TM), *this, *RBI)); +} + +const CallLowering *RISCVSubtarget::getCallLowering() const { + return CallLoweringInfo.get(); +} + +InstructionSelector *RISCVSubtarget::getInstructionSelector() const { + return InstSelector.get(); +} + +const LegalizerInfo *RISCVSubtarget::getLegalizerInfo() const { + return Legalizer.get(); +} + +const RegisterBankInfo *RISCVSubtarget::getRegBankInfo() const { + return RegBankInfo.get(); +} diff --git a/lib/Target/RISCV/RISCVSubtarget.h b/lib/Target/RISCV/RISCVSubtarget.h index 106ff49f021a..7d0373a5253a 100644 --- a/lib/Target/RISCV/RISCVSubtarget.h +++ b/lib/Target/RISCV/RISCVSubtarget.h @@ -17,6 +17,10 @@ #include "RISCVISelLowering.h" #include "RISCVInstrInfo.h" #include "Utils/RISCVBaseInfo.h" +#include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" @@ -38,6 +42,7 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { bool HasRV64 = false; bool IsRV32E = false; bool EnableLinkerRelax = false; + bool EnableRVCHintInstrs = false; unsigned XLen = 32; MVT XLenVT = MVT::i32; RISCVABI::ABI TargetABI = RISCVABI::ABI_Unknown; @@ -75,6 +80,7 @@ public: const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { return &TSInfo; } + bool enableMachineScheduler() const override { return true; } bool hasStdExtM() const { return HasStdExtM; } bool hasStdExtA() const { return HasStdExtA; } bool hasStdExtF() const { return HasStdExtF; } @@ -83,9 +89,23 @@ public: bool is64Bit() const { return HasRV64; } bool isRV32E() const { return IsRV32E; } bool enableLinkerRelax() const { return EnableLinkerRelax; } + 
bool enableRVCHintInstrs() const { return EnableRVCHintInstrs; } MVT getXLenVT() const { return XLenVT; } unsigned getXLen() const { return XLen; } RISCVABI::ABI getTargetABI() const { return TargetABI; } + +protected: + // GlobalISel related APIs. + std::unique_ptr<CallLowering> CallLoweringInfo; + std::unique_ptr<InstructionSelector> InstSelector; + std::unique_ptr<LegalizerInfo> Legalizer; + std::unique_ptr<RegisterBankInfo> RegBankInfo; + +public: + const CallLowering *getCallLowering() const override; + InstructionSelector *getInstructionSelector() const override; + const LegalizerInfo *getLegalizerInfo() const override; + const RegisterBankInfo *getRegBankInfo() const override; }; } // End llvm namespace diff --git a/lib/Target/RISCV/RISCVTargetMachine.cpp b/lib/Target/RISCV/RISCVTargetMachine.cpp index f4e6ed9f6284..5ffc6eda6bd7 100644 --- a/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -17,6 +17,10 @@ #include "TargetInfo/RISCVTargetInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/GlobalISel/IRTranslator.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -30,6 +34,7 @@ extern "C" void LLVMInitializeRISCVTarget() { RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target()); RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target()); auto PR = PassRegistry::getPassRegistry(); + initializeGlobalISel(*PR); initializeRISCVExpandPseudoPass(*PR); } @@ -58,7 +63,7 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, getEffectiveRelocModel(TT, RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), - TLOF(make_unique<RISCVELFTargetObjectFile>()), + TLOF(std::make_unique<RISCVELFTargetObjectFile>()), Subtarget(TT, CPU, FS, Options.MCOptions.getABIName(), *this) { initAsmInfo(); } @@ -80,6 +85,10 @@ public: void addIRPasses() override; bool addInstSelector() override; + bool addIRTranslator() override; + bool addLegalizeMachineIR() override; + bool addRegBankSelect() override; + bool addGlobalInstructionSelect() override; void addPreEmitPass() override; void addPreEmitPass2() override; void addPreRegAlloc() override; @@ -101,6 +110,26 @@ bool RISCVPassConfig::addInstSelector() { return false; } +bool RISCVPassConfig::addIRTranslator() { + addPass(new IRTranslator()); + return false; +} + +bool RISCVPassConfig::addLegalizeMachineIR() { + addPass(new Legalizer()); + return false; +} + +bool RISCVPassConfig::addRegBankSelect() { + addPass(new RegBankSelect()); + return false; +} + +bool RISCVPassConfig::addGlobalInstructionSelect() { + addPass(new InstructionSelect()); + return false; +} + void RISCVPassConfig::addPreEmitPass() { addPass(&BranchRelaxationPassID); } void RISCVPassConfig::addPreEmitPass2() { diff --git a/lib/Target/RISCV/Utils/RISCVBaseInfo.h b/lib/Target/RISCV/Utils/RISCVBaseInfo.h index c33c72f24319..30e475e80a01 100644 --- a/lib/Target/RISCV/Utils/RISCVBaseInfo.h +++ b/lib/Target/RISCV/Utils/RISCVBaseInfo.h @@ -16,6 +16,7 @@ #include "MCTargetDesc/RISCVMCTargetDesc.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/SubtargetFeature.h" namespace llvm { @@ -63,6 
+64,21 @@ enum { }; } // namespace RISCVII +namespace RISCVOp { +enum OperandType : unsigned { + OPERAND_FIRST_RISCV_IMM = MCOI::OPERAND_FIRST_TARGET, + OPERAND_UIMM4 = OPERAND_FIRST_RISCV_IMM, + OPERAND_UIMM5, + OPERAND_UIMM12, + OPERAND_SIMM12, + OPERAND_SIMM13_LSB0, + OPERAND_UIMM20, + OPERAND_SIMM21_LSB0, + OPERAND_UIMMLOG2XLEN, + OPERAND_LAST_RISCV_IMM = OPERAND_UIMMLOG2XLEN +}; +} // namespace RISCVOp + // Describes the predecessor/successor bits used in the FENCE instruction. namespace RISCVFenceField { enum FenceField { diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index 15453ae59a4f..f6be9dd01249 100644 --- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -376,7 +376,7 @@ public: } static std::unique_ptr<SparcOperand> CreateToken(StringRef Str, SMLoc S) { - auto Op = make_unique<SparcOperand>(k_Token); + auto Op = std::make_unique<SparcOperand>(k_Token); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->StartLoc = S; @@ -386,7 +386,7 @@ public: static std::unique_ptr<SparcOperand> CreateReg(unsigned RegNum, unsigned Kind, SMLoc S, SMLoc E) { - auto Op = make_unique<SparcOperand>(k_Register); + auto Op = std::make_unique<SparcOperand>(k_Register); Op->Reg.RegNum = RegNum; Op->Reg.Kind = (SparcOperand::RegisterKind)Kind; Op->StartLoc = S; @@ -396,7 +396,7 @@ public: static std::unique_ptr<SparcOperand> CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) { - auto Op = make_unique<SparcOperand>(k_Immediate); + auto Op = std::make_unique<SparcOperand>(k_Immediate); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -481,7 +481,7 @@ public: static std::unique_ptr<SparcOperand> CreateMEMr(unsigned Base, SMLoc S, SMLoc E) { - auto Op = make_unique<SparcOperand>(k_MemoryReg); + auto Op = std::make_unique<SparcOperand>(k_MemoryReg); Op->Mem.Base = Base; Op->Mem.OffsetReg = Sparc::G0; // always 0 Op->Mem.Off = nullptr; diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp index f1ca8e18c228..db8e7850300f 100644 --- a/lib/Target/Sparc/DelaySlotFiller.cpp +++ b/lib/Target/Sparc/DelaySlotFiller.cpp @@ -253,7 +253,7 @@ bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate, if (!MO.isReg()) continue; // skip - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (MO.isDef()) { // check whether Reg is defined or used before delay slot. 
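The unsigned-to-Register changes running through the RISCV and Sparc files above are part of a tree-wide move to the llvm::Register wrapper. The sketch below illustrates the idea only and is not the real class: LLVM tags virtual registers by setting the top bit of the register number, and the wrapper gives that test a name while still converting to unsigned so existing comparisons keep compiling.

// Illustrative sketch of the Register idea (simplified; not LLVM's class).
#include <iostream>

class Register {
  unsigned Reg;
public:
  Register(unsigned R = 0) : Reg(R) {}
  operator unsigned() const { return Reg; } // legacy unsigned code keeps working
  bool isVirtual() const { return int(Reg) < 0; }  // top bit set
  bool isPhysical() const { return int(Reg) > 0; } // nonzero, top bit clear
};

int main() {
  Register Phys(42);               // e.g. a physical register id
  Register Virt(0x80000000u | 7u); // e.g. a virtual register id
  std::cout << Phys.isPhysical() << ' ' << Virt.isVirtual() << '\n'; // 1 1
}

Because the wrapper converts back to unsigned implicitly, hunks like the ones in DelaySlotFiller.cpp can change the declared type without touching the surrounding logic.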
@@ -324,7 +324,7 @@ void Filler::insertDefsUses(MachineBasicBlock::iterator MI, if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0) continue; if (MO.isDef()) @@ -380,7 +380,7 @@ static bool combineRestoreADD(MachineBasicBlock::iterator RestoreMI, // // After : restore <op0>, <op1>, %o[0-7] - unsigned reg = AddMI->getOperand(0).getReg(); + Register reg = AddMI->getOperand(0).getReg(); if (reg < SP::I0 || reg > SP::I7) return false; @@ -408,7 +408,7 @@ static bool combineRestoreOR(MachineBasicBlock::iterator RestoreMI, // // After : restore <op0>, <op1>, %o[0-7] - unsigned reg = OrMI->getOperand(0).getReg(); + Register reg = OrMI->getOperand(0).getReg(); if (reg < SP::I0 || reg > SP::I7) return false; @@ -446,7 +446,7 @@ static bool combineRestoreSETHIi(MachineBasicBlock::iterator RestoreMI, // // After : restore %g0, (imm3<<10), %o[0-7] - unsigned reg = SetHiMI->getOperand(0).getReg(); + Register reg = SetHiMI->getOperand(0).getReg(); if (reg < SP::I0 || reg > SP::I7) return false; diff --git a/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp index 88547075c5ae..c97a30e634cc 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp @@ -49,7 +49,7 @@ unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx, } if (IsPCRel) { - switch((unsigned)Fixup.getKind()) { + switch(Fixup.getTargetKind()) { default: llvm_unreachable("Unimplemented fixup -> relocation"); case FK_Data_1: return ELF::R_SPARC_DISP8; @@ -65,7 +65,7 @@ unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx, } } - switch((unsigned)Fixup.getKind()) { + switch(Fixup.getTargetKind()) { default: llvm_unreachable("Unimplemented fixup -> relocation"); case FK_Data_1: return ELF::R_SPARC_8; @@ -135,5 +135,5 @@ bool SparcELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, std::unique_ptr<MCObjectTargetWriter> llvm::createSparcELFObjectWriter(bool Is64Bit, uint8_t OSABI) { - return llvm::make_unique<SparcELFObjectWriter>(Is64Bit, OSABI); + return std::make_unique<SparcELFObjectWriter>(Is64Bit, OSABI); } diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp index 1834a6fd861d..0f74f2bb344c 100644 --- a/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/lib/Target/Sparc/SparcFrameLowering.cpp @@ -34,7 +34,8 @@ DisableLeafProc("disable-sparc-leaf-proc", SparcFrameLowering::SparcFrameLowering(const SparcSubtarget &ST) : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, - ST.is64Bit() ? 16 : 8, 0, ST.is64Bit() ? 16 : 8) {} + ST.is64Bit() ? Align(16) : Align(8), 0, + ST.is64Bit() ? Align(16) : Align(8)) {} void SparcFrameLowering::emitSPAdjustment(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp index 8cff50d19ed4..4e61c341b703 100644 --- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp +++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp @@ -231,7 +231,7 @@ bool SparcDAGToDAGISel::tryInlineAsm(SDNode *N){ // Replace the two GPRs with 1 GPRPair and copy values from GPRPair to // the original GPRs. 
- unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass); + Register GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass); PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32); SDValue Chain = SDValue(N,0); @@ -278,7 +278,7 @@ bool SparcDAGToDAGISel::tryInlineAsm(SDNode *N){ // Copy REG_SEQ into a GPRPair-typed VR and replace the original two // i32 VRs of inline asm with it. - unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass); + Register GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass); PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32); Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1)); diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index a6d440fa8aa2..4a2ba00ac6c2 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -417,7 +417,7 @@ SDValue SparcTargetLowering::LowerFormalArguments_32( if (VA.needsCustom()) { assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32); - unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass); + Register VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass); MF.getRegInfo().addLiveIn(VA.getLocReg(), VRegHi); SDValue HiVal = DAG.getCopyFromReg(Chain, dl, VRegHi, MVT::i32); @@ -445,7 +445,7 @@ SDValue SparcTargetLowering::LowerFormalArguments_32( InVals.push_back(WholeValue); continue; } - unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass); + Register VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass); MF.getRegInfo().addLiveIn(VA.getLocReg(), VReg); SDValue Arg = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); if (VA.getLocVT() == MVT::f32) @@ -552,7 +552,7 @@ SDValue SparcTargetLowering::LowerFormalArguments_32( std::vector<SDValue> OutChains; for (; CurArgReg != ArgRegEnd; ++CurArgReg) { - unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass); + Register VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass); MF.getRegInfo().addLiveIn(*CurArgReg, VReg); SDValue Arg = DAG.getCopyFromReg(DAG.getRoot(), dl, VReg, MVT::i32); @@ -1016,9 +1016,9 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned SparcTargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { - unsigned Reg = StringSwitch<unsigned>(RegName) +Register SparcTargetLowering::getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const { + Register Reg = StringSwitch<unsigned>(RegName) .Case("i0", SP::I0).Case("i1", SP::I1).Case("i2", SP::I2).Case("i3", SP::I3) .Case("i4", SP::I4).Case("i5", SP::I5).Case("i6", SP::I6).Case("i7", SP::I7) .Case("o0", SP::O0).Case("o1", SP::O1).Case("o2", SP::O2).Case("o3", SP::O3) @@ -1438,7 +1438,7 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, setOperationAction(Op, MVT::v2i32, Expand); } // Truncating/extending stores/loads are also not supported. 
- for (MVT VT : MVT::integer_vector_valuetypes()) { + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i32, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Expand); @@ -1805,7 +1805,7 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setMinFunctionAlignment(2); + setMinFunctionAlignment(Align(4)); computeRegisterProperties(Subtarget->getRegisterInfo()); } @@ -2244,7 +2244,7 @@ SDValue SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS, return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS); } case SPCC::FCC_UL : { - SDValue Mask = DAG.getTargetConstant(1, DL, Result.getValueType()); + SDValue Mask = DAG.getConstant(1, DL, Result.getValueType()); Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask); SDValue RHS = DAG.getTargetConstant(0, DL, Result.getValueType()); SPCC = SPCC::ICC_NE; @@ -2277,14 +2277,14 @@ SDValue SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS, return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS); } case SPCC::FCC_LG : { - SDValue Mask = DAG.getTargetConstant(3, DL, Result.getValueType()); + SDValue Mask = DAG.getConstant(3, DL, Result.getValueType()); Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask); SDValue RHS = DAG.getTargetConstant(0, DL, Result.getValueType()); SPCC = SPCC::ICC_NE; return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS); } case SPCC::FCC_UE : { - SDValue Mask = DAG.getTargetConstant(3, DL, Result.getValueType()); + SDValue Mask = DAG.getConstant(3, DL, Result.getValueType()); Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask); SDValue RHS = DAG.getTargetConstant(0, DL, Result.getValueType()); SPCC = SPCC::ICC_E; @@ -2951,9 +2951,11 @@ static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG, SDValue HiRHS = DAG.getNode(ISD::SRA, dl, MVT::i64, RHS, ShiftAmt); SDValue Args[] = { HiLHS, LHS, HiRHS, RHS }; + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setSExt(isSigned); SDValue MulResult = TLI.makeLibCall(DAG, RTLIB::MUL_I128, WideVT, - Args, isSigned, dl).first; + Args, CallOptions, dl).first; SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, MulResult, DAG.getIntPtrConstant(0, dl)); SDValue TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, @@ -3183,7 +3185,7 @@ SparcTargetLowering::getConstraintType(StringRef Constraint) const { case 'e': return C_RegisterClass; case 'I': // SIMM13 - return C_Other; + return C_Immediate; } } diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h index 8d557a4225e5..3d798cec0c16 100644 --- a/lib/Target/Sparc/SparcISelLowering.h +++ b/lib/Target/Sparc/SparcISelLowering.h @@ -98,8 +98,8 @@ namespace llvm { return MVT::i32; } - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; /// If a physical register, this returns the register that receives the /// exception address on entry to an EH pad. 
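One change in the SparcISelLowering.cpp hunk above is easy to misread: setMinFunctionAlignment(2) becomes setMinFunctionAlignment(Align(4)). The old integer API took a log2 value, while the Align type carries a byte count and stores the log2 internally, so both spellings request 4-byte alignment. A minimal sketch of that wrapper idea follows, assuming only the power-of-two invariant; it is an illustration, not the LLVM class.

// Illustrative sketch of an Align-style wrapper (not LLVM's class).
#include <cassert>
#include <cstdint>
#include <iostream>

class Align {
  uint8_t ShiftValue = 0; // log2 of the alignment in bytes
public:
  explicit Align(uint64_t Value) {
    assert(Value > 0 && (Value & (Value - 1)) == 0 && "must be a power of 2");
    while (Value > 1) { Value >>= 1; ++ShiftValue; }
  }
  uint64_t value() const { return uint64_t(1) << ShiftValue; }
};

int main() {
  Align FnAlign(4); // 4 bytes, where the old API wrote the log2 value 2
  std::cout << FnAlign.value() << " bytes\n"; // prints "4 bytes"
}

Storing the shift rather than the byte count keeps the type one byte wide and makes a zero alignment unrepresentable, the same trade-off the SparcFrameLowering.cpp hunk above relies on when it switches to Align(8) and Align(16).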
diff --git a/lib/Target/Sparc/SparcInstr64Bit.td b/lib/Target/Sparc/SparcInstr64Bit.td index 2d4f687f72d2..d18ab3b1370b 100644 --- a/lib/Target/Sparc/SparcInstr64Bit.td +++ b/lib/Target/Sparc/SparcInstr64Bit.td @@ -177,7 +177,7 @@ def LEAX_ADDri : F3_2<2, 0b000000, def : Pat<(SPcmpicc i64:$a, i64:$b), (CMPrr $a, $b)>; def : Pat<(SPcmpicc i64:$a, (i64 simm13:$b)), (CMPri $a, (as_i32imm $b))>; -def : Pat<(ctpop i64:$src), (POPCrr $src)>; +def : Pat<(i64 (ctpop i64:$src)), (POPCrr $src)>; } // Predicates = [Is64Bit] diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp index ad343fe6f80a..3d3d314a26bb 100644 --- a/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/lib/Target/Sparc/SparcInstrInfo.cpp @@ -375,8 +375,8 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineInstr *MovMI = nullptr; for (unsigned i = 0; i != numSubRegs; ++i) { - unsigned Dst = TRI->getSubReg(DestReg, subRegIdx[i]); - unsigned Src = TRI->getSubReg(SrcReg, subRegIdx[i]); + Register Dst = TRI->getSubReg(DestReg, subRegIdx[i]); + Register Src = TRI->getSubReg(SrcReg, subRegIdx[i]); assert(Dst && Src && "Bad sub-register"); MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(movOpc), Dst); diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td index 8474c7abffb3..73dbdc4f443e 100644 --- a/lib/Target/Sparc/SparcInstrInfo.td +++ b/lib/Target/Sparc/SparcInstrInfo.td @@ -516,9 +516,9 @@ let DecoderMethod = "DecodeLoadQFP" in defm LDQF : LoadA<"ldq", 0b100010, 0b110010, load, QFPRegs, f128>, Requires<[HasV9, HasHardQuad]>; -let DecoderMethod = "DecodeLoadCP" in - defm LDC : Load<"ld", 0b110000, load, CoprocRegs, i32>; -let DecoderMethod = "DecodeLoadCPPair" in +let DecoderMethod = "DecodeLoadCP" in + defm LDC : Load<"ld", 0b110000, load, CoprocRegs, i32>; +let DecoderMethod = "DecodeLoadCPPair" in defm LDDC : Load<"ldd", 0b110011, load, CoprocPair, v2i32, IIC_ldd>; let DecoderMethod = "DecodeLoadCP", Defs = [CPSR] in { @@ -1508,7 +1508,7 @@ let rs1 = 0 in def POPCrr : F3_1<2, 0b101110, (outs IntRegs:$rd), (ins IntRegs:$rs2), "popc $rs2, $rd", []>, Requires<[HasV9]>; -def : Pat<(ctpop i32:$src), +def : Pat<(i32 (ctpop i32:$src)), (POPCrr (SRLri $src, 0))>; let Predicates = [HasV9], hasSideEffects = 1, rd = 0, rs1 = 0b01111 in diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp index ce11a423d10e..19a90e98db7e 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.cpp +++ b/lib/Target/Sparc/SparcRegisterInfo.cpp @@ -182,9 +182,9 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (!Subtarget.isV9() || !Subtarget.hasHardQuad()) { if (MI.getOpcode() == SP::STQFri) { const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); - unsigned SrcReg = MI.getOperand(2).getReg(); - unsigned SrcEvenReg = getSubReg(SrcReg, SP::sub_even64); - unsigned SrcOddReg = getSubReg(SrcReg, SP::sub_odd64); + Register SrcReg = MI.getOperand(2).getReg(); + Register SrcEvenReg = getSubReg(SrcReg, SP::sub_even64); + Register SrcOddReg = getSubReg(SrcReg, SP::sub_odd64); MachineInstr *StMI = BuildMI(*MI.getParent(), II, dl, TII.get(SP::STDFri)) .addReg(FrameReg).addImm(0).addReg(SrcEvenReg); @@ -194,9 +194,9 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Offset += 8; } else if (MI.getOpcode() == SP::LDQFri) { const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned DestEvenReg = getSubReg(DestReg, SP::sub_even64); - unsigned DestOddReg = 
getSubReg(DestReg, SP::sub_odd64); + Register DestReg = MI.getOperand(0).getReg(); + Register DestEvenReg = getSubReg(DestReg, SP::sub_even64); + Register DestOddReg = getSubReg(DestReg, SP::sub_odd64); MachineInstr *LdMI = BuildMI(*MI.getParent(), II, dl, TII.get(SP::LDDFri), DestEvenReg) .addReg(FrameReg).addImm(0); diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp index 195cff79de03..c1e3f8c36982 100644 --- a/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/lib/Target/Sparc/SparcTargetMachine.cpp @@ -98,7 +98,7 @@ SparcTargetMachine::SparcTargetMachine( getEffectiveSparcCodeModel( CM, getEffectiveRelocModel(RM), is64bit, JIT), OL), - TLOF(make_unique<SparcELFTargetObjectFile>()), + TLOF(std::make_unique<SparcELFTargetObjectFile>()), Subtarget(TT, CPU, FS, *this, is64bit), is64Bit(is64bit) { initAsmInfo(); } @@ -133,7 +133,7 @@ SparcTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. resetTargetOptions(F); - I = llvm::make_unique<SparcSubtarget>(TargetTriple, CPU, FS, *this, + I = std::make_unique<SparcSubtarget>(TargetTriple, CPU, FS, *this, this->is64Bit); } return I.get(); diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index a259ba3433d6..93c4ce4b5ccc 100644 --- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -155,11 +155,11 @@ public: // Create particular kinds of operand. static std::unique_ptr<SystemZOperand> createInvalid(SMLoc StartLoc, SMLoc EndLoc) { - return make_unique<SystemZOperand>(KindInvalid, StartLoc, EndLoc); + return std::make_unique<SystemZOperand>(KindInvalid, StartLoc, EndLoc); } static std::unique_ptr<SystemZOperand> createToken(StringRef Str, SMLoc Loc) { - auto Op = make_unique<SystemZOperand>(KindToken, Loc, Loc); + auto Op = std::make_unique<SystemZOperand>(KindToken, Loc, Loc); Op->Token.Data = Str.data(); Op->Token.Length = Str.size(); return Op; @@ -167,7 +167,7 @@ public: static std::unique_ptr<SystemZOperand> createReg(RegisterKind Kind, unsigned Num, SMLoc StartLoc, SMLoc EndLoc) { - auto Op = make_unique<SystemZOperand>(KindReg, StartLoc, EndLoc); + auto Op = std::make_unique<SystemZOperand>(KindReg, StartLoc, EndLoc); Op->Reg.Kind = Kind; Op->Reg.Num = Num; return Op; @@ -175,7 +175,7 @@ public: static std::unique_ptr<SystemZOperand> createImm(const MCExpr *Expr, SMLoc StartLoc, SMLoc EndLoc) { - auto Op = make_unique<SystemZOperand>(KindImm, StartLoc, EndLoc); + auto Op = std::make_unique<SystemZOperand>(KindImm, StartLoc, EndLoc); Op->Imm = Expr; return Op; } @@ -184,7 +184,7 @@ public: createMem(MemoryKind MemKind, RegisterKind RegKind, unsigned Base, const MCExpr *Disp, unsigned Index, const MCExpr *LengthImm, unsigned LengthReg, SMLoc StartLoc, SMLoc EndLoc) { - auto Op = make_unique<SystemZOperand>(KindMem, StartLoc, EndLoc); + auto Op = std::make_unique<SystemZOperand>(KindMem, StartLoc, EndLoc); Op->Mem.MemKind = MemKind; Op->Mem.RegKind = RegKind; Op->Mem.Base = Base; @@ -200,7 +200,7 @@ public: static std::unique_ptr<SystemZOperand> createImmTLS(const MCExpr *Imm, const MCExpr *Sym, SMLoc StartLoc, SMLoc EndLoc) { - auto Op = make_unique<SystemZOperand>(KindImmTLS, StartLoc, EndLoc); + auto Op = std::make_unique<SystemZOperand>(KindImmTLS, StartLoc, EndLoc); Op->ImmTLS.Imm = Imm; Op->ImmTLS.Sym = Sym; return Op; diff --git 
a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp index 8d8ba5644e10..49b6fc490336 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp @@ -162,5 +162,5 @@ unsigned SystemZObjectWriter::getRelocType(MCContext &Ctx, std::unique_ptr<MCObjectTargetWriter> llvm::createSystemZObjectWriter(uint8_t OSABI) { - return llvm::make_unique<SystemZObjectWriter>(OSABI); + return std::make_unique<SystemZObjectWriter>(OSABI); } diff --git a/lib/Target/SystemZ/SystemZ.h b/lib/Target/SystemZ/SystemZ.h index 2b0f90182d7f..88cf589a3f10 100644 --- a/lib/Target/SystemZ/SystemZ.h +++ b/lib/Target/SystemZ/SystemZ.h @@ -190,7 +190,6 @@ static inline bool isImmHF(uint64_t Val) { FunctionPass *createSystemZISelDag(SystemZTargetMachine &TM, CodeGenOpt::Level OptLevel); FunctionPass *createSystemZElimComparePass(SystemZTargetMachine &TM); -FunctionPass *createSystemZExpandPseudoPass(SystemZTargetMachine &TM); FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM); FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM); FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM); diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp index ef378e4ade7a..10023e9e169c 100644 --- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -501,6 +501,10 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) { } break; + case TargetOpcode::FENTRY_CALL: + LowerFENTRY_CALL(*MI, Lower); + return; + case TargetOpcode::STACKMAP: LowerSTACKMAP(*MI); return; @@ -546,6 +550,22 @@ static unsigned EmitNop(MCContext &OutContext, MCStreamer &OutStreamer, } } +void SystemZAsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI, + SystemZMCInstLower &Lower) { + MCContext &Ctx = MF->getContext(); + if (MF->getFunction().getFnAttribute("mnop-mcount") + .getValueAsString() == "true") { + EmitNop(Ctx, *OutStreamer, 6, getSubtargetInfo()); + return; + } + + MCSymbol *fentry = Ctx.getOrCreateSymbol("__fentry__"); + const MCSymbolRefExpr *Op = + MCSymbolRefExpr::create(fentry, MCSymbolRefExpr::VK_PLT, Ctx); + OutStreamer->EmitInstruction(MCInstBuilder(SystemZ::BRASL) + .addReg(SystemZ::R0D).addExpr(Op), getSubtargetInfo()); +} + void SystemZAsmPrinter::LowerSTACKMAP(const MachineInstr &MI) { const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>(MF->getSubtarget().getInstrInfo()); diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.h b/lib/Target/SystemZ/SystemZAsmPrinter.h index aa5d3ca78e61..d01a17c2ebe2 100644 --- a/lib/Target/SystemZ/SystemZAsmPrinter.h +++ b/lib/Target/SystemZ/SystemZAsmPrinter.h @@ -46,6 +46,7 @@ public: } private: + void LowerFENTRY_CALL(const MachineInstr &MI, SystemZMCInstLower &MCIL); void LowerSTACKMAP(const MachineInstr &MI); void LowerPATCHPOINT(const MachineInstr &MI, SystemZMCInstLower &Lower); }; diff --git a/lib/Target/SystemZ/SystemZElimCompare.cpp b/lib/Target/SystemZ/SystemZElimCompare.cpp index 9cbf6b320504..946eb2ba7c79 100644 --- a/lib/Target/SystemZ/SystemZElimCompare.cpp +++ b/lib/Target/SystemZ/SystemZElimCompare.cpp @@ -152,7 +152,7 @@ Reference SystemZElimCompare::getRegReferences(MachineInstr &MI, unsigned Reg) { for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { const MachineOperand &MO = MI.getOperand(I); if (MO.isReg()) { - if (unsigned MOReg = MO.getReg()) { + if (Register MOReg = MO.getReg()) { if 
(TRI->regsOverlap(MOReg, Reg)) { if (MO.isUse()) Ref.Use = true; @@ -378,11 +378,8 @@ bool SystemZElimCompare::adjustCCMasksForInstr( } // CC is now live after MI. - if (!ConvOpc) { - int CCDef = MI.findRegisterDefOperandIdx(SystemZ::CC, false, true, TRI); - assert(CCDef >= 0 && "Couldn't find CC set"); - MI.getOperand(CCDef).setIsDead(false); - } + if (!ConvOpc) + MI.clearRegisterDeads(SystemZ::CC); // Check if MI lies before Compare. bool BeforeCmp = false; diff --git a/lib/Target/SystemZ/SystemZExpandPseudo.cpp b/lib/Target/SystemZ/SystemZExpandPseudo.cpp deleted file mode 100644 index 09708fb4241c..000000000000 --- a/lib/Target/SystemZ/SystemZExpandPseudo.cpp +++ /dev/null @@ -1,152 +0,0 @@ -//==-- SystemZExpandPseudo.cpp - Expand pseudo instructions -------*- C++ -*-=// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass that expands pseudo instructions into target -// instructions to allow proper scheduling and other late optimizations. This -// pass should be run after register allocation but before the post-regalloc -// scheduling pass. -// -//===----------------------------------------------------------------------===// - -#include "SystemZ.h" -#include "SystemZInstrInfo.h" -#include "SystemZSubtarget.h" -#include "llvm/CodeGen/LivePhysRegs.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -using namespace llvm; - -#define SYSTEMZ_EXPAND_PSEUDO_NAME "SystemZ pseudo instruction expansion pass" - -namespace llvm { - void initializeSystemZExpandPseudoPass(PassRegistry&); -} - -namespace { -class SystemZExpandPseudo : public MachineFunctionPass { -public: - static char ID; - SystemZExpandPseudo() : MachineFunctionPass(ID) { - initializeSystemZExpandPseudoPass(*PassRegistry::getPassRegistry()); - } - - const SystemZInstrInfo *TII; - - bool runOnMachineFunction(MachineFunction &Fn) override; - - StringRef getPassName() const override { return SYSTEMZ_EXPAND_PSEUDO_NAME; } - -private: - bool expandMBB(MachineBasicBlock &MBB); - bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI); - bool expandLOCRMux(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI); -}; -char SystemZExpandPseudo::ID = 0; -} - -INITIALIZE_PASS(SystemZExpandPseudo, "systemz-expand-pseudo", - SYSTEMZ_EXPAND_PSEUDO_NAME, false, false) - -/// Returns an instance of the pseudo instruction expansion pass. -FunctionPass *llvm::createSystemZExpandPseudoPass(SystemZTargetMachine &TM) { - return new SystemZExpandPseudo(); -} - -// MI is a load-register-on-condition pseudo instruction that could not be -// handled as a single hardware instruction. Replace it by a branch sequence. 
-bool SystemZExpandPseudo::expandLOCRMux(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI) { - MachineFunction &MF = *MBB.getParent(); - const BasicBlock *BB = MBB.getBasicBlock(); - MachineInstr &MI = *MBBI; - DebugLoc DL = MI.getDebugLoc(); - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(2).getReg(); - unsigned CCValid = MI.getOperand(3).getImm(); - unsigned CCMask = MI.getOperand(4).getImm(); - - LivePhysRegs LiveRegs(TII->getRegisterInfo()); - LiveRegs.addLiveOuts(MBB); - for (auto I = std::prev(MBB.end()); I != MBBI; --I) - LiveRegs.stepBackward(*I); - - // Splice MBB at MI, moving the rest of the block into RestMBB. - MachineBasicBlock *RestMBB = MF.CreateMachineBasicBlock(BB); - MF.insert(std::next(MachineFunction::iterator(MBB)), RestMBB); - RestMBB->splice(RestMBB->begin(), &MBB, MI, MBB.end()); - RestMBB->transferSuccessors(&MBB); - for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) - RestMBB->addLiveIn(*I); - - // Create a new block MoveMBB to hold the move instruction. - MachineBasicBlock *MoveMBB = MF.CreateMachineBasicBlock(BB); - MF.insert(std::next(MachineFunction::iterator(MBB)), MoveMBB); - MoveMBB->addLiveIn(SrcReg); - for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) - MoveMBB->addLiveIn(*I); - - // At the end of MBB, create a conditional branch to RestMBB if the - // condition is false, otherwise fall through to MoveMBB. - BuildMI(&MBB, DL, TII->get(SystemZ::BRC)) - .addImm(CCValid).addImm(CCMask ^ CCValid).addMBB(RestMBB); - MBB.addSuccessor(RestMBB); - MBB.addSuccessor(MoveMBB); - - // In MoveMBB, emit an instruction to move SrcReg into DestReg, - // then fall through to RestMBB. - TII->copyPhysReg(*MoveMBB, MoveMBB->end(), DL, DestReg, SrcReg, - MI.getOperand(2).isKill()); - MoveMBB->addSuccessor(RestMBB); - - NextMBBI = MBB.end(); - MI.eraseFromParent(); - return true; -} - -/// If MBBI references a pseudo instruction that should be expanded here, -/// do the expansion and return true. Otherwise return false. -bool SystemZExpandPseudo::expandMI(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI) { - MachineInstr &MI = *MBBI; - switch (MI.getOpcode()) { - case SystemZ::LOCRMux: - return expandLOCRMux(MBB, MBBI, NextMBBI); - default: - break; - } - return false; -} - -/// Iterate over the instructions in basic block MBB and expand any -/// pseudo instructions. Return true if anything was modified. 
-bool SystemZExpandPseudo::expandMBB(MachineBasicBlock &MBB) { - bool Modified = false; - - MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - while (MBBI != E) { - MachineBasicBlock::iterator NMBBI = std::next(MBBI); - Modified |= expandMI(MBB, MBBI, NMBBI); - MBBI = NMBBI; - } - - return Modified; -} - -bool SystemZExpandPseudo::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo()); - - bool Modified = false; - for (auto &MBB : MF) - Modified |= expandMBB(MBB); - return Modified; -} - diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp index da28faebb326..0b8b6880accc 100644 --- a/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -46,8 +46,8 @@ static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = { } // end anonymous namespace SystemZFrameLowering::SystemZFrameLowering() - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, - -SystemZMC::CallFrameSize, 8, + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(8), + -SystemZMC::CallFrameSize, Align(8), false /* StackRealignable */) { // Create a mapping from register number to save slot offset. RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS); @@ -118,7 +118,7 @@ static void addSavedGPR(MachineBasicBlock &MBB, MachineInstrBuilder &MIB, unsigned GPR64, bool IsImplicit) { const TargetRegisterInfo *RI = MBB.getParent()->getSubtarget().getRegisterInfo(); - unsigned GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_l32); + Register GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_l32); bool IsLive = MBB.isLiveIn(GPR64) || MBB.isLiveIn(GPR32); if (!IsLive || !IsImplicit) { MIB.addReg(GPR64, getImplRegState(IsImplicit) | getKillRegState(!IsLive)); diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index 9dc4512255cc..751034c2d41a 100644 --- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -346,6 +346,11 @@ public: : SelectionDAGISel(TM, OptLevel) {} bool runOnMachineFunction(MachineFunction &MF) override { + const Function &F = MF.getFunction(); + if (F.getFnAttribute("mnop-mcount").getValueAsString() == "true" && + F.getFnAttribute("fentry-call").getValueAsString() != "true") + report_fatal_error("mnop-mcount only supported with fentry-call"); + Subtarget = &MF.getSubtarget<SystemZSubtarget>(); return SelectionDAGISel::runOnMachineFunction(MF); } @@ -1146,7 +1151,7 @@ void SystemZDAGToDAGISel::loadVectorConstant( SDLoc DL(Node); SmallVector<SDValue, 2> Ops; for (unsigned OpVal : VCI.OpVals) - Ops.push_back(CurDAG->getConstant(OpVal, DL, MVT::i32)); + Ops.push_back(CurDAG->getTargetConstant(OpVal, DL, MVT::i32)); SDValue Op = CurDAG->getNode(VCI.Opcode, DL, VCI.VecVT, Ops); if (VCI.VecVT == VT.getSimpleVT()) @@ -1550,8 +1555,8 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) { uint64_t ConstCCMask = cast<ConstantSDNode>(CCMask.getNode())->getZExtValue(); // Invert the condition. 
- CCMask = CurDAG->getConstant(ConstCCValid ^ ConstCCMask, SDLoc(Node), - CCMask.getValueType()); + CCMask = CurDAG->getTargetConstant(ConstCCValid ^ ConstCCMask, + SDLoc(Node), CCMask.getValueType()); SDValue Op4 = Node->getOperand(4); SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(Node, Op1, Op0, CCValid, CCMask, Op4); diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 78820f511ab4..e0ca9da93561 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -120,9 +120,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // Instructions are strings of 2-byte aligned 2-byte values. - setMinFunctionAlignment(2); + setMinFunctionAlignment(Align(2)); // For performance reasons we prefer 16-byte alignment. - setPrefFunctionAlignment(4); + setPrefFunctionAlignment(Align(16)); // Handle operations that are handled in a similar way for all types. for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE; @@ -206,6 +206,12 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, // the default expansion. if (!Subtarget.hasFPExtension()) setOperationAction(ISD::FP_TO_UINT, VT, Expand); + + // Mirror those settings for STRICT_FP_TO_[SU]INT. Note that these all + // default to Expand, so need to be modified to Legal where appropriate. + setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Legal); + if (Subtarget.hasFPExtension()) + setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Legal); } } @@ -252,7 +258,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Promote); setOperationAction(ISD::CTLZ, MVT::i64, Legal); - // On arch13 we have native support for a 64-bit CTPOP. + // On z15 we have native support for a 64-bit CTPOP. if (Subtarget.hasMiscellaneousExtensions3()) { setOperationAction(ISD::CTPOP, MVT::i32, Promote); setOperationAction(ISD::CTPOP, MVT::i64, Legal); @@ -294,14 +300,14 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, // Handle prefetches with PFD or PFDRL. setOperationAction(ISD::PREFETCH, MVT::Other, Custom); - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { // Assume by default that all vector operations need to be expanded. for (unsigned Opcode = 0; Opcode < ISD::BUILTIN_OP_END; ++Opcode) if (getOperationAction(Opcode, VT) == Legal) setOperationAction(Opcode, VT, Expand); // Likewise all truncating stores and extending loads. - for (MVT InnerVT : MVT::vector_valuetypes()) { + for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); @@ -327,7 +333,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, } // Handle integer vector types. - for (MVT VT : MVT::integer_vector_valuetypes()) { + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) { if (isTypeLegal(VT)) { // These operations have direct equivalents. 
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal); @@ -381,6 +387,11 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::v2f64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Legal); + + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i64, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i64, Legal); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f64, Legal); } if (Subtarget.hasVectorEnhancements2()) { @@ -392,6 +403,11 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::v4f32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4f32, Legal); + + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f32, Legal); } // Handle floating-point types. @@ -831,7 +847,7 @@ supportedAddressingMode(Instruction *I, bool HasVector) { } if (isa<LoadInst>(I) && I->hasOneUse()) { - auto *SingleUser = dyn_cast<Instruction>(*I->user_begin()); + auto *SingleUser = cast<Instruction>(*I->user_begin()); if (SingleUser->getParent() == I->getParent()) { if (isa<ICmpInst>(SingleUser)) { if (auto *C = dyn_cast<ConstantInt>(SingleUser->getOperand(1))) @@ -956,7 +972,7 @@ SystemZTargetLowering::getConstraintType(StringRef Constraint) const { case 'K': // Signed 16-bit constant case 'L': // Signed 20-bit displacement (on all targets we support) case 'M': // 0x7fffffff - return C_Other; + return C_Immediate; default: break; @@ -1335,7 +1351,7 @@ SDValue SystemZTargetLowering::LowerFormalArguments( break; } - unsigned VReg = MRI.createVirtualRegister(RC); + Register VReg = MRI.createVirtualRegister(RC); MRI.addLiveIn(VA.getLocReg(), VReg); ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, LocVT); } else { @@ -1430,7 +1446,7 @@ static bool canUseSiblingCall(const CCState &ArgCCInfo, return false; if (!VA.isRegLoc()) return false; - unsigned Reg = VA.getLocReg(); + Register Reg = VA.getLocReg(); if (Reg == SystemZ::R6H || Reg == SystemZ::R6L || Reg == SystemZ::R6D) return false; if (Outs[I].Flags.isSwiftSelf() || Outs[I].Flags.isSwiftError()) @@ -1674,7 +1690,7 @@ SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, RetValue = convertValVTToLocVT(DAG, DL, VA, RetValue); // Chain and glue the copies together. 
- unsigned Reg = VA.getLocReg(); + Register Reg = VA.getLocReg(); Chain = DAG.getCopyToReg(Chain, DL, Reg, RetValue, Glue); Glue = Chain.getValue(1); RetOps.push_back(DAG.getRegister(Reg, VA.getLocVT())); @@ -2533,12 +2549,12 @@ static SDValue emitCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) { } if (C.Opcode == SystemZISD::ICMP) return DAG.getNode(SystemZISD::ICMP, DL, MVT::i32, C.Op0, C.Op1, - DAG.getConstant(C.ICmpType, DL, MVT::i32)); + DAG.getTargetConstant(C.ICmpType, DL, MVT::i32)); if (C.Opcode == SystemZISD::TM) { bool RegisterOnly = (bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_0) != bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_1)); return DAG.getNode(SystemZISD::TM, DL, MVT::i32, C.Op0, C.Op1, - DAG.getConstant(RegisterOnly, DL, MVT::i32)); + DAG.getTargetConstant(RegisterOnly, DL, MVT::i32)); } return DAG.getNode(C.Opcode, DL, MVT::i32, C.Op0, C.Op1); } @@ -2576,10 +2592,10 @@ static void lowerGR128Binary(SelectionDAG &DAG, const SDLoc &DL, EVT VT, // in CCValid, so other values can be ignored. static SDValue emitSETCC(SelectionDAG &DAG, const SDLoc &DL, SDValue CCReg, unsigned CCValid, unsigned CCMask) { - SDValue Ops[] = { DAG.getConstant(1, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(CCValid, DL, MVT::i32), - DAG.getConstant(CCMask, DL, MVT::i32), CCReg }; + SDValue Ops[] = {DAG.getConstant(1, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + DAG.getTargetConstant(CCValid, DL, MVT::i32), + DAG.getTargetConstant(CCMask, DL, MVT::i32), CCReg}; return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, MVT::i32, Ops); } @@ -2741,9 +2757,10 @@ SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const { Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL)); SDValue CCReg = emitCmp(DAG, DL, C); - return DAG.getNode(SystemZISD::BR_CCMASK, DL, Op.getValueType(), - Op.getOperand(0), DAG.getConstant(C.CCValid, DL, MVT::i32), - DAG.getConstant(C.CCMask, DL, MVT::i32), Dest, CCReg); + return DAG.getNode( + SystemZISD::BR_CCMASK, DL, Op.getValueType(), Op.getOperand(0), + DAG.getTargetConstant(C.CCValid, DL, MVT::i32), + DAG.getTargetConstant(C.CCMask, DL, MVT::i32), Dest, CCReg); } // Return true if Pos is CmpOp and Neg is the negative of CmpOp, @@ -2794,8 +2811,9 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op, } SDValue CCReg = emitCmp(DAG, DL, C); - SDValue Ops[] = {TrueOp, FalseOp, DAG.getConstant(C.CCValid, DL, MVT::i32), - DAG.getConstant(C.CCMask, DL, MVT::i32), CCReg}; + SDValue Ops[] = {TrueOp, FalseOp, + DAG.getTargetConstant(C.CCValid, DL, MVT::i32), + DAG.getTargetConstant(C.CCMask, DL, MVT::i32), CCReg}; return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, Op.getValueType(), Ops); } @@ -3882,11 +3900,8 @@ SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op, bool IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); unsigned Code = IsWrite ? 
SystemZ::PFD_WRITE : SystemZ::PFD_READ; auto *Node = cast<MemIntrinsicSDNode>(Op.getNode()); - SDValue Ops[] = { - Op.getOperand(0), - DAG.getConstant(Code, DL, MVT::i32), - Op.getOperand(1) - }; + SDValue Ops[] = {Op.getOperand(0), DAG.getTargetConstant(Code, DL, MVT::i32), + Op.getOperand(1)}; return DAG.getMemIntrinsicNode(SystemZISD::PREFETCH, DL, Node->getVTList(), Ops, Node->getMemoryVT(), Node->getMemOperand()); @@ -4228,7 +4243,7 @@ static SDValue getPermuteNode(SelectionDAG &DAG, const SDLoc &DL, Op1 = DAG.getNode(ISD::BITCAST, DL, InVT, Op1); SDValue Op; if (P.Opcode == SystemZISD::PERMUTE_DWORDS) { - SDValue Op2 = DAG.getConstant(P.Operand, DL, MVT::i32); + SDValue Op2 = DAG.getTargetConstant(P.Operand, DL, MVT::i32); Op = DAG.getNode(SystemZISD::PERMUTE_DWORDS, DL, InVT, Op0, Op1, Op2); } else if (P.Opcode == SystemZISD::PACK) { MVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(P.Operand * 8), @@ -4253,7 +4268,8 @@ static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL, unsigned StartIndex, OpNo0, OpNo1; if (isShlDoublePermute(Bytes, StartIndex, OpNo0, OpNo1)) return DAG.getNode(SystemZISD::SHL_DOUBLE, DL, MVT::v16i8, Ops[OpNo0], - Ops[OpNo1], DAG.getConstant(StartIndex, DL, MVT::i32)); + Ops[OpNo1], + DAG.getTargetConstant(StartIndex, DL, MVT::i32)); // Fall back on VPERM. Construct an SDNode for the permute vector. SDValue IndexNodes[SystemZ::VectorBytes]; @@ -4751,7 +4767,7 @@ SDValue SystemZTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op0.getOperand(Index)); // Otherwise keep it as a vector-to-vector operation. return DAG.getNode(SystemZISD::SPLAT, DL, VT, Op.getOperand(0), - DAG.getConstant(Index, DL, MVT::i32)); + DAG.getTargetConstant(Index, DL, MVT::i32)); } GeneralShuffle GS(VT); @@ -6041,8 +6057,8 @@ SDValue SystemZTargetLowering::combineBR_CCMASK( if (combineCCMask(CCReg, CCValidVal, CCMaskVal)) return DAG.getNode(SystemZISD::BR_CCMASK, SDLoc(N), N->getValueType(0), Chain, - DAG.getConstant(CCValidVal, SDLoc(N), MVT::i32), - DAG.getConstant(CCMaskVal, SDLoc(N), MVT::i32), + DAG.getTargetConstant(CCValidVal, SDLoc(N), MVT::i32), + DAG.getTargetConstant(CCMaskVal, SDLoc(N), MVT::i32), N->getOperand(3), CCReg); return SDValue(); } @@ -6063,10 +6079,9 @@ SDValue SystemZTargetLowering::combineSELECT_CCMASK( if (combineCCMask(CCReg, CCValidVal, CCMaskVal)) return DAG.getNode(SystemZISD::SELECT_CCMASK, SDLoc(N), N->getValueType(0), - N->getOperand(0), - N->getOperand(1), - DAG.getConstant(CCValidVal, SDLoc(N), MVT::i32), - DAG.getConstant(CCMaskVal, SDLoc(N), MVT::i32), + N->getOperand(0), N->getOperand(1), + DAG.getTargetConstant(CCValidVal, SDLoc(N), MVT::i32), + DAG.getTargetConstant(CCMaskVal, SDLoc(N), MVT::i32), CCReg); return SDValue(); } @@ -6548,19 +6563,17 @@ static bool isSelectPseudo(MachineInstr &MI) { // Helper function, which inserts PHI functions into SinkMBB: // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ], -// where %FalseValue(i) and %TrueValue(i) are taken from the consequent Selects -// in [MIItBegin, MIItEnd) range. -static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin, - MachineBasicBlock::iterator MIItEnd, +// where %FalseValue(i) and %TrueValue(i) are taken from Selects. 
+static void createPHIsForSelects(SmallVector<MachineInstr*, 8> &Selects, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB) { MachineFunction *MF = TrueMBB->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - unsigned CCValid = MIItBegin->getOperand(3).getImm(); - unsigned CCMask = MIItBegin->getOperand(4).getImm(); - DebugLoc DL = MIItBegin->getDebugLoc(); + MachineInstr *FirstMI = Selects.front(); + unsigned CCValid = FirstMI->getOperand(3).getImm(); + unsigned CCMask = FirstMI->getOperand(4).getImm(); MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin(); @@ -6572,16 +6585,15 @@ static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin, // destination registers, and the registers that went into the PHI. DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable; - for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; - MIIt = skipDebugInstructionsForward(++MIIt, MIItEnd)) { - unsigned DestReg = MIIt->getOperand(0).getReg(); - unsigned TrueReg = MIIt->getOperand(1).getReg(); - unsigned FalseReg = MIIt->getOperand(2).getReg(); + for (auto MI : Selects) { + Register DestReg = MI->getOperand(0).getReg(); + Register TrueReg = MI->getOperand(1).getReg(); + Register FalseReg = MI->getOperand(2).getReg(); // If this Select we are generating is the opposite condition from // the jump we generated, then we have to swap the operands for the // PHI that is going to be generated. - if (MIIt->getOperand(4).getImm() == (CCValid ^ CCMask)) + if (MI->getOperand(4).getImm() == (CCValid ^ CCMask)) std::swap(TrueReg, FalseReg); if (RegRewriteTable.find(TrueReg) != RegRewriteTable.end()) @@ -6590,6 +6602,7 @@ static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin, if (RegRewriteTable.find(FalseReg) != RegRewriteTable.end()) FalseReg = RegRewriteTable[FalseReg].second; + DebugLoc DL = MI->getDebugLoc(); BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(SystemZ::PHI), DestReg) .addReg(TrueReg).addMBB(TrueMBB) .addReg(FalseReg).addMBB(FalseMBB); @@ -6605,36 +6618,61 @@ static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock * SystemZTargetLowering::emitSelect(MachineInstr &MI, MachineBasicBlock *MBB) const { + assert(isSelectPseudo(MI) && "Bad call to emitSelect()"); const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo()); unsigned CCValid = MI.getOperand(3).getImm(); unsigned CCMask = MI.getOperand(4).getImm(); - DebugLoc DL = MI.getDebugLoc(); // If we have a sequence of Select* pseudo instructions using the // same condition code value, we want to expand all of them into // a single pair of basic blocks using the same condition. 
- MachineInstr *LastMI = &MI; - MachineBasicBlock::iterator NextMIIt = skipDebugInstructionsForward( - std::next(MachineBasicBlock::iterator(MI)), MBB->end()); - - if (isSelectPseudo(MI)) - while (NextMIIt != MBB->end() && isSelectPseudo(*NextMIIt) && - NextMIIt->getOperand(3).getImm() == CCValid && - (NextMIIt->getOperand(4).getImm() == CCMask || - NextMIIt->getOperand(4).getImm() == (CCValid ^ CCMask))) { - LastMI = &*NextMIIt; - NextMIIt = skipDebugInstructionsForward(++NextMIIt, MBB->end()); + SmallVector<MachineInstr*, 8> Selects; + SmallVector<MachineInstr*, 8> DbgValues; + Selects.push_back(&MI); + unsigned Count = 0; + for (MachineBasicBlock::iterator NextMIIt = + std::next(MachineBasicBlock::iterator(MI)); + NextMIIt != MBB->end(); ++NextMIIt) { + if (NextMIIt->definesRegister(SystemZ::CC)) + break; + if (isSelectPseudo(*NextMIIt)) { + assert(NextMIIt->getOperand(3).getImm() == CCValid && + "Bad CCValid operands since CC was not redefined."); + if (NextMIIt->getOperand(4).getImm() == CCMask || + NextMIIt->getOperand(4).getImm() == (CCValid ^ CCMask)) { + Selects.push_back(&*NextMIIt); + continue; + } + break; } + bool User = false; + for (auto SelMI : Selects) + if (NextMIIt->readsVirtualRegister(SelMI->getOperand(0).getReg())) { + User = true; + break; + } + if (NextMIIt->isDebugInstr()) { + if (User) { + assert(NextMIIt->isDebugValue() && "Unhandled debug opcode."); + DbgValues.push_back(&*NextMIIt); + } + } + else if (User || ++Count > 20) + break; + } + MachineInstr *LastMI = Selects.back(); + bool CCKilled = + (LastMI->killsRegister(SystemZ::CC) || checkCCKill(*LastMI, MBB)); MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB); + MachineBasicBlock *JoinMBB = splitBlockAfter(LastMI, MBB); MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB); // Unless CC was killed in the last Select instruction, mark it as // live-in to both FalseMBB and JoinMBB. - if (!LastMI->killsRegister(SystemZ::CC) && !checkCCKill(*LastMI, JoinMBB)) { + if (!CCKilled) { FalseMBB->addLiveIn(SystemZ::CC); JoinMBB->addLiveIn(SystemZ::CC); } @@ -6643,7 +6681,7 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI, // BRC CCMask, JoinMBB // # fallthrough to FalseMBB MBB = StartMBB; - BuildMI(MBB, DL, TII->get(SystemZ::BRC)) + BuildMI(MBB, MI.getDebugLoc(), TII->get(SystemZ::BRC)) .addImm(CCValid).addImm(CCMask).addMBB(JoinMBB); MBB->addSuccessor(JoinMBB); MBB->addSuccessor(FalseMBB); @@ -6657,12 +6695,14 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI, // %Result = phi [ %FalseReg, FalseMBB ], [ %TrueReg, StartMBB ] // ... 
MBB = JoinMBB; - MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); - MachineBasicBlock::iterator MIItEnd = skipDebugInstructionsForward( - std::next(MachineBasicBlock::iterator(LastMI)), MBB->end()); - createPHIsForSelects(MIItBegin, MIItEnd, StartMBB, FalseMBB, MBB); + createPHIsForSelects(Selects, StartMBB, FalseMBB, MBB); + for (auto SelMI : Selects) + SelMI->eraseFromParent(); + + MachineBasicBlock::iterator InsertPos = MBB->getFirstNonPHI(); + for (auto DbgMI : DbgValues) + MBB->splice(InsertPos, StartMBB, DbgMI); - StartMBB->erase(MIItBegin, MIItEnd); return JoinMBB; } @@ -6678,10 +6718,10 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI, const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo()); - unsigned SrcReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(0).getReg(); MachineOperand Base = MI.getOperand(1); int64_t Disp = MI.getOperand(2).getImm(); - unsigned IndexReg = MI.getOperand(3).getReg(); + Register IndexReg = MI.getOperand(3).getReg(); unsigned CCValid = MI.getOperand(4).getImm(); unsigned CCMask = MI.getOperand(5).getImm(); DebugLoc DL = MI.getDebugLoc(); @@ -6773,7 +6813,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary( // Extract the operands. Base can be a register or a frame index. // Src2 can be a register or immediate. - unsigned Dest = MI.getOperand(0).getReg(); + Register Dest = MI.getOperand(0).getReg(); MachineOperand Base = earlyUseOperand(MI.getOperand(1)); int64_t Disp = MI.getOperand(2).getImm(); MachineOperand Src2 = earlyUseOperand(MI.getOperand(3)); @@ -6833,7 +6873,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary( .addReg(OldVal).addReg(BitShift).addImm(0); if (Invert) { // Perform the operation normally and then invert every bit of the field. - unsigned Tmp = MRI.createVirtualRegister(RC); + Register Tmp = MRI.createVirtualRegister(RC); BuildMI(MBB, DL, TII->get(BinOpcode), Tmp).addReg(RotatedOldVal).add(Src2); if (BitSize <= 32) // XILF with the upper BitSize bits set. @@ -6842,7 +6882,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary( else { // Use LCGR and add -1 to the result, which is more compact than // an XILF, XILH pair. - unsigned Tmp2 = MRI.createVirtualRegister(RC); + Register Tmp2 = MRI.createVirtualRegister(RC); BuildMI(MBB, DL, TII->get(SystemZ::LCGR), Tmp2).addReg(Tmp); BuildMI(MBB, DL, TII->get(SystemZ::AGHI), RotatedNewVal) .addReg(Tmp2).addImm(-1); @@ -6891,7 +6931,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax( bool IsSubWord = (BitSize < 32); // Extract the operands. Base can be a register or a frame index. - unsigned Dest = MI.getOperand(0).getReg(); + Register Dest = MI.getOperand(0).getReg(); MachineOperand Base = earlyUseOperand(MI.getOperand(1)); int64_t Disp = MI.getOperand(2).getImm(); Register Src2 = MI.getOperand(3).getReg(); @@ -7005,13 +7045,13 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI, MachineRegisterInfo &MRI = MF.getRegInfo(); // Extract the operands. Base can be a register or a frame index. 
- unsigned Dest = MI.getOperand(0).getReg(); + Register Dest = MI.getOperand(0).getReg(); MachineOperand Base = earlyUseOperand(MI.getOperand(1)); int64_t Disp = MI.getOperand(2).getImm(); - unsigned OrigCmpVal = MI.getOperand(3).getReg(); - unsigned OrigSwapVal = MI.getOperand(4).getReg(); - unsigned BitShift = MI.getOperand(5).getReg(); - unsigned NegBitShift = MI.getOperand(6).getReg(); + Register OrigCmpVal = MI.getOperand(3).getReg(); + Register OrigSwapVal = MI.getOperand(4).getReg(); + Register BitShift = MI.getOperand(5).getReg(); + Register NegBitShift = MI.getOperand(6).getReg(); int64_t BitSize = MI.getOperand(7).getImm(); DebugLoc DL = MI.getDebugLoc(); @@ -7023,14 +7063,14 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI, assert(LOpcode && CSOpcode && "Displacement out of range"); // Create virtual registers for temporary results. - unsigned OrigOldVal = MRI.createVirtualRegister(RC); - unsigned OldVal = MRI.createVirtualRegister(RC); - unsigned CmpVal = MRI.createVirtualRegister(RC); - unsigned SwapVal = MRI.createVirtualRegister(RC); - unsigned StoreVal = MRI.createVirtualRegister(RC); - unsigned RetryOldVal = MRI.createVirtualRegister(RC); - unsigned RetryCmpVal = MRI.createVirtualRegister(RC); - unsigned RetrySwapVal = MRI.createVirtualRegister(RC); + Register OrigOldVal = MRI.createVirtualRegister(RC); + Register OldVal = MRI.createVirtualRegister(RC); + Register CmpVal = MRI.createVirtualRegister(RC); + Register SwapVal = MRI.createVirtualRegister(RC); + Register StoreVal = MRI.createVirtualRegister(RC); + Register RetryOldVal = MRI.createVirtualRegister(RC); + Register RetryCmpVal = MRI.createVirtualRegister(RC); + Register RetrySwapVal = MRI.createVirtualRegister(RC); // Insert 2 basic blocks for the loop. MachineBasicBlock *StartMBB = MBB; @@ -7129,11 +7169,11 @@ SystemZTargetLowering::emitPair128(MachineInstr &MI, MachineRegisterInfo &MRI = MF.getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Dest = MI.getOperand(0).getReg(); - unsigned Hi = MI.getOperand(1).getReg(); - unsigned Lo = MI.getOperand(2).getReg(); - unsigned Tmp1 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass); - unsigned Tmp2 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass); + Register Dest = MI.getOperand(0).getReg(); + Register Hi = MI.getOperand(1).getReg(); + Register Lo = MI.getOperand(2).getReg(); + Register Tmp1 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass); + Register Tmp2 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass); BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Tmp1); BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Tmp2) @@ -7157,14 +7197,14 @@ MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI, MachineRegisterInfo &MRI = MF.getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Dest = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); - unsigned In128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass); + Register Dest = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + Register In128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass); BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), In128); if (ClearEven) { - unsigned NewIn128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass); - unsigned Zero64 = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass); + Register NewIn128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass); + Register Zero64 = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass); BuildMI(*MBB, MI, DL, 
TII->get(SystemZ::LLILL), Zero64) .addImm(0); @@ -7308,7 +7348,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( // The previous iteration might have created out-of-range displacements. // Apply them using LAY if so. if (!isUInt<12>(DestDisp)) { - unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); + Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg) .add(DestBase) .addImm(DestDisp) @@ -7317,7 +7357,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( DestDisp = 0; } if (!isUInt<12>(SrcDisp)) { - unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); + Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg) .add(SrcBase) .addImm(SrcDisp) @@ -7474,11 +7514,11 @@ MachineBasicBlock *SystemZTargetLowering::emitLoadAndTestCmp0( static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo()); DebugLoc DL = MI.getDebugLoc(); - unsigned SrcReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(0).getReg(); // Create new virtual register of the same class as source. const TargetRegisterClass *RC = MRI->getRegClass(SrcReg); - unsigned DstReg = MRI->createVirtualRegister(RC); + Register DstReg = MRI->createVirtualRegister(RC); // Replace pseudo with a normal load-and-test that models the def as // well. diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index 19c7ec58ed3d..9c95e8aec940 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -25,10 +25,10 @@ let Predicates = [FeatureNoVectorEnhancements1] in let Predicates = [FeatureVectorEnhancements1] in def SelectVR128 : SelectWrapper<f128, VR128>; -defm CondStoreF32 : CondStores<FP32, nonvolatile_store, - nonvolatile_load, bdxaddr20only>; -defm CondStoreF64 : CondStores<FP64, nonvolatile_store, - nonvolatile_load, bdxaddr20only>; +defm CondStoreF32 : CondStores<FP32, simple_store, + simple_load, bdxaddr20only>; +defm CondStoreF64 : CondStores<FP64, simple_store, + simple_load, bdxaddr20only>; //===----------------------------------------------------------------------===// // Move instructions @@ -276,13 +276,13 @@ let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC] in { } // fp_to_sint always rounds towards zero, which is modifier value 5. -def : Pat<(i32 (fp_to_sint FP32:$src)), (CFEBR 5, FP32:$src)>; -def : Pat<(i32 (fp_to_sint FP64:$src)), (CFDBR 5, FP64:$src)>; -def : Pat<(i32 (fp_to_sint FP128:$src)), (CFXBR 5, FP128:$src)>; +def : Pat<(i32 (any_fp_to_sint FP32:$src)), (CFEBR 5, FP32:$src)>; +def : Pat<(i32 (any_fp_to_sint FP64:$src)), (CFDBR 5, FP64:$src)>; +def : Pat<(i32 (any_fp_to_sint FP128:$src)), (CFXBR 5, FP128:$src)>; -def : Pat<(i64 (fp_to_sint FP32:$src)), (CGEBR 5, FP32:$src)>; -def : Pat<(i64 (fp_to_sint FP64:$src)), (CGDBR 5, FP64:$src)>; -def : Pat<(i64 (fp_to_sint FP128:$src)), (CGXBR 5, FP128:$src)>; +def : Pat<(i64 (any_fp_to_sint FP32:$src)), (CGEBR 5, FP32:$src)>; +def : Pat<(i64 (any_fp_to_sint FP64:$src)), (CGDBR 5, FP64:$src)>; +def : Pat<(i64 (any_fp_to_sint FP128:$src)), (CGXBR 5, FP128:$src)>; // The FP extension feature provides versions of the above that allow // also specifying the inexact-exception suppression flag. 
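The any_fp_to_sint patterns above are the TableGen half of the strict floating-point work: each pattern now matches both the plain fp_to_sint selection DAG node and its chained STRICT_FP_TO_SINT twin, so the existing CFEBR/CFDBR/CFXBR definitions cover both without new instructions. The C++ half is the setOperationAction hunk in SystemZISelLowering.cpp above. As a condensed sketch of the rule it encodes (an illustration, not the committed code: the real loop runs over every integer value type, here fixed to i32/i64):

  // Strict FP-to-int conversions default to Expand; mark them Legal
  // exactly where the non-strict forms already are.  Signed conversions
  // exist on every subtarget; the unsigned forms (CLFEBR etc.) need the
  // floating-point extension facility.
  for (MVT VT : {MVT::i32, MVT::i64}) {
    setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Legal);
    if (Subtarget.hasFPExtension())
      setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Legal);
  }

The rounding-mode modifier value 5 (round toward zero) in the emitted instructions is unchanged; only the pattern predicates are relaxed, as the unsigned-conversion hunk below shows.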
@@ -309,13 +309,13 @@ let Predicates = [FeatureFPExtension] in { def CLGXBR : TernaryRRFe<"clgxbr", 0xB3AE, GR64, FP128>; } - def : Pat<(i32 (fp_to_uint FP32:$src)), (CLFEBR 5, FP32:$src, 0)>; - def : Pat<(i32 (fp_to_uint FP64:$src)), (CLFDBR 5, FP64:$src, 0)>; - def : Pat<(i32 (fp_to_uint FP128:$src)), (CLFXBR 5, FP128:$src, 0)>; + def : Pat<(i32 (any_fp_to_uint FP32:$src)), (CLFEBR 5, FP32:$src, 0)>; + def : Pat<(i32 (any_fp_to_uint FP64:$src)), (CLFDBR 5, FP64:$src, 0)>; + def : Pat<(i32 (any_fp_to_uint FP128:$src)), (CLFXBR 5, FP128:$src, 0)>; - def : Pat<(i64 (fp_to_uint FP32:$src)), (CLGEBR 5, FP32:$src, 0)>; - def : Pat<(i64 (fp_to_uint FP64:$src)), (CLGDBR 5, FP64:$src, 0)>; - def : Pat<(i64 (fp_to_uint FP128:$src)), (CLGXBR 5, FP128:$src, 0)>; + def : Pat<(i64 (any_fp_to_uint FP32:$src)), (CLGEBR 5, FP32:$src, 0)>; + def : Pat<(i64 (any_fp_to_uint FP64:$src)), (CLGDBR 5, FP64:$src, 0)>; + def : Pat<(i64 (any_fp_to_uint FP128:$src)), (CLGXBR 5, FP128:$src, 0)>; } diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td index 2a1d14de3ddf..c9dbe3da686d 100644 --- a/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/lib/Target/SystemZ/SystemZInstrFormats.td @@ -2141,17 +2141,17 @@ class FixedCondBranchRXY<CondVariant V, string mnemonic, bits<16> opcode, } class CmpBranchRIEa<string mnemonic, bits<16> opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIEa<opcode, (outs), (ins cls:$R1, imm:$I2, cond4:$M3), mnemonic#"$M3\t$R1, $I2", []>; class AsmCmpBranchRIEa<string mnemonic, bits<16> opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIEa<opcode, (outs), (ins cls:$R1, imm:$I2, imm32zx4:$M3), mnemonic#"\t$R1, $I2, $M3", []>; class FixedCmpBranchRIEa<CondVariant V, string mnemonic, bits<16> opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIEa<opcode, (outs), (ins cls:$R1, imm:$I2), mnemonic#V.suffix#"\t$R1, $I2", []> { let isAsmParserOnly = V.alternate; @@ -2159,7 +2159,7 @@ class FixedCmpBranchRIEa<CondVariant V, string mnemonic, bits<16> opcode, } multiclass CmpBranchRIEaPair<string mnemonic, bits<16> opcode, - RegisterOperand cls, Immediate imm> { + RegisterOperand cls, ImmOpWithPattern imm> { let isCodeGenOnly = 1 in def "" : CmpBranchRIEa<mnemonic, opcode, cls, imm>; def Asm : AsmCmpBranchRIEa<mnemonic, opcode, cls, imm>; @@ -2193,19 +2193,19 @@ multiclass CmpBranchRIEbPair<string mnemonic, bits<16> opcode, } class CmpBranchRIEc<string mnemonic, bits<16> opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIEc<opcode, (outs), (ins cls:$R1, imm:$I2, cond4:$M3, brtarget16:$RI4), mnemonic#"$M3\t$R1, $I2, $RI4", []>; class AsmCmpBranchRIEc<string mnemonic, bits<16> opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIEc<opcode, (outs), (ins cls:$R1, imm:$I2, imm32zx4:$M3, brtarget16:$RI4), mnemonic#"\t$R1, $I2, $M3, $RI4", []>; class FixedCmpBranchRIEc<CondVariant V, string mnemonic, bits<16> opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIEc<opcode, (outs), (ins cls:$R1, imm:$I2, brtarget16:$RI4), mnemonic#V.suffix#"\t$R1, $I2, $RI4", []> { let isAsmParserOnly = V.alternate; @@ -2213,7 +2213,7 @@ class FixedCmpBranchRIEc<CondVariant V, string mnemonic, bits<16> opcode, } multiclass CmpBranchRIEcPair<string mnemonic, bits<16> opcode, - RegisterOperand cls, Immediate 
imm> { + RegisterOperand cls, ImmOpWithPattern imm> { let isCodeGenOnly = 1 in def "" : CmpBranchRIEc<mnemonic, opcode, cls, imm>; def Asm : AsmCmpBranchRIEc<mnemonic, opcode, cls, imm>; @@ -2272,19 +2272,19 @@ multiclass CmpBranchRRSPair<string mnemonic, bits<16> opcode, } class CmpBranchRIS<string mnemonic, bits<16> opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIS<opcode, (outs), (ins cls:$R1, imm:$I2, cond4:$M3, bdaddr12only:$BD4), mnemonic#"$M3\t$R1, $I2, $BD4", []>; class AsmCmpBranchRIS<string mnemonic, bits<16> opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIS<opcode, (outs), (ins cls:$R1, imm:$I2, imm32zx4:$M3, bdaddr12only:$BD4), mnemonic#"\t$R1, $I2, $M3, $BD4", []>; class FixedCmpBranchRIS<CondVariant V, string mnemonic, bits<16> opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIS<opcode, (outs), (ins cls:$R1, imm:$I2, bdaddr12only:$BD4), mnemonic#V.suffix#"\t$R1, $I2, $BD4", []> { let isAsmParserOnly = V.alternate; @@ -2292,7 +2292,7 @@ class FixedCmpBranchRIS<CondVariant V, string mnemonic, bits<16> opcode, } multiclass CmpBranchRISPair<string mnemonic, bits<16> opcode, - RegisterOperand cls, Immediate imm> { + RegisterOperand cls, ImmOpWithPattern imm> { let isCodeGenOnly = 1 in def "" : CmpBranchRIS<mnemonic, opcode, cls, imm>; def Asm : AsmCmpBranchRIS<mnemonic, opcode, cls, imm>; @@ -2585,7 +2585,7 @@ multiclass StoreMultipleVRSaAlign<string mnemonic, bits<16> opcode> { // We therefore match the address in the same way as a normal store and // only use the StoreSI* instruction if the matched address is suitable. class StoreSI<string mnemonic, bits<8> opcode, SDPatternOperator operator, - Immediate imm> + ImmOpWithPattern imm> : InstSI<opcode, (outs), (ins mviaddr12pair:$BD1, imm:$I2), mnemonic#"\t$BD1, $I2", [(operator imm:$I2, mviaddr12pair:$BD1)]> { @@ -2593,7 +2593,7 @@ class StoreSI<string mnemonic, bits<8> opcode, SDPatternOperator operator, } class StoreSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator, - Immediate imm> + ImmOpWithPattern imm> : InstSIY<opcode, (outs), (ins mviaddr20pair:$BD1, imm:$I2), mnemonic#"\t$BD1, $I2", [(operator imm:$I2, mviaddr20pair:$BD1)]> { @@ -2601,7 +2601,7 @@ class StoreSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator, } class StoreSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator, - Immediate imm> + ImmOpWithPattern imm> : InstSIL<opcode, (outs), (ins mviaddr12pair:$BD1, imm:$I2), mnemonic#"\t$BD1, $I2", [(operator imm:$I2, mviaddr12pair:$BD1)]> { @@ -2609,7 +2609,7 @@ class StoreSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator, } multiclass StoreSIPair<string mnemonic, bits<8> siOpcode, bits<16> siyOpcode, - SDPatternOperator operator, Immediate imm> { + SDPatternOperator operator, ImmOpWithPattern imm> { let DispKey = mnemonic in { let DispSize = "12" in def "" : StoreSI<mnemonic, siOpcode, operator, imm>; @@ -2665,7 +2665,7 @@ multiclass CondStoreRSYPair<string mnemonic, bits<16> opcode, def Asm : AsmCondStoreRSY<mnemonic, opcode, cls, bytes, mode>; } -class SideEffectUnaryI<string mnemonic, bits<8> opcode, Immediate imm> +class SideEffectUnaryI<string mnemonic, bits<8> opcode, ImmOpWithPattern imm> : InstI<opcode, (outs), (ins imm:$I1), mnemonic#"\t$I1", []>; @@ -2761,13 +2761,13 @@ class UnaryMemRRFc<string mnemonic, bits<16> opcode, } class UnaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator, - 
RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIa<opcode, (outs cls:$R1), (ins imm:$I2), mnemonic#"\t$R1, $I2", [(set cls:$R1, (operator imm:$I2))]>; class UnaryRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRILa<opcode, (outs cls:$R1), (ins imm:$I2), mnemonic#"\t$R1, $I2", [(set cls:$R1, (operator imm:$I2))]>; @@ -2885,14 +2885,14 @@ multiclass UnaryRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode, } class UnaryVRIa<string mnemonic, bits<16> opcode, SDPatternOperator operator, - TypedReg tr, Immediate imm, bits<4> type = 0> + TypedReg tr, ImmOpWithPattern imm, bits<4> type = 0> : InstVRIa<opcode, (outs tr.op:$V1), (ins imm:$I2), mnemonic#"\t$V1, $I2", - [(set (tr.vt tr.op:$V1), (operator imm:$I2))]> { + [(set (tr.vt tr.op:$V1), (operator (i32 timm:$I2)))]> { let M3 = type; } -class UnaryVRIaGeneric<string mnemonic, bits<16> opcode, Immediate imm> +class UnaryVRIaGeneric<string mnemonic, bits<16> opcode, ImmOpWithPattern imm> : InstVRIa<opcode, (outs VR128:$V1), (ins imm:$I2, imm32zx4:$M3), mnemonic#"\t$V1, $I2, $M3", []>; @@ -3021,7 +3021,7 @@ class SideEffectBinaryRRFc<string mnemonic, bits<16> opcode, } class SideEffectBinaryIE<string mnemonic, bits<16> opcode, - Immediate imm1, Immediate imm2> + ImmOpWithPattern imm1, ImmOpWithPattern imm2> : InstIE<opcode, (outs), (ins imm1:$I1, imm2:$I2), mnemonic#"\t$I1, $I2", []>; @@ -3030,7 +3030,7 @@ class SideEffectBinarySI<string mnemonic, bits<8> opcode, Operand imm> mnemonic#"\t$BD1, $I2", []>; class SideEffectBinarySIL<string mnemonic, bits<16> opcode, - SDPatternOperator operator, Immediate imm> + SDPatternOperator operator, ImmOpWithPattern imm> : InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2), mnemonic#"\t$BD1, $I2", [(operator bdaddr12only:$BD1, imm:$I2)]>; @@ -3165,7 +3165,7 @@ class BinaryRRFc<string mnemonic, bits<16> opcode, mnemonic#"\t$R1, $R2, $M3", []>; class BinaryMemRRFc<string mnemonic, bits<16> opcode, - RegisterOperand cls1, RegisterOperand cls2, Immediate imm> + RegisterOperand cls1, RegisterOperand cls2, ImmOpWithPattern imm> : InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src, imm:$M3), mnemonic#"\t$R1, $R2, $M3", []> { let Constraints = "$R1 = $R1src"; @@ -3267,7 +3267,7 @@ multiclass CondBinaryRRFaPair<string mnemonic, bits<16> opcode, } class BinaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIa<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2), mnemonic#"\t$R1, $I2", [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> { @@ -3276,14 +3276,14 @@ class BinaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator, } class BinaryRIE<string mnemonic, bits<16> opcode, SDPatternOperator operator, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIEd<opcode, (outs cls:$R1), (ins cls:$R3, imm:$I2), mnemonic#"\t$R1, $R3, $I2", [(set cls:$R1, (operator cls:$R3, imm:$I2))]>; multiclass BinaryRIAndK<string mnemonic, bits<12> opcode1, bits<16> opcode2, SDPatternOperator operator, RegisterOperand cls, - Immediate imm> { + ImmOpWithPattern imm> { let NumOpsKey = mnemonic in { let NumOpsValue = "3" in def K : BinaryRIE<mnemonic##"k", opcode2, operator, cls, imm>, @@ -3294,7 +3294,7 @@ multiclass BinaryRIAndK<string mnemonic, bits<12> opcode1, bits<16> opcode2, } class CondBinaryRIE<string mnemonic, 
bits<16> opcode, RegisterOperand cls, - Immediate imm> + ImmOpWithPattern imm> : InstRIEg<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2, cond4:$valid, cond4:$M3), mnemonic#"$M3\t$R1, $I2", @@ -3308,7 +3308,7 @@ class CondBinaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls, // Like CondBinaryRIE, but used for the raw assembly form. The condition-code // mask is the third operand rather than being part of the mnemonic. class AsmCondBinaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls, - Immediate imm> + ImmOpWithPattern imm> : InstRIEg<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2, imm32zx4:$M3), mnemonic#"\t$R1, $I2, $M3", []> { @@ -3318,7 +3318,7 @@ class AsmCondBinaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls, // Like CondBinaryRIE, but with a fixed CC mask. class FixedCondBinaryRIE<CondVariant V, string mnemonic, bits<16> opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIEg<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2), mnemonic#V.suffix#"\t$R1, $I2", []> { let Constraints = "$R1 = $R1src"; @@ -3328,14 +3328,14 @@ class FixedCondBinaryRIE<CondVariant V, string mnemonic, bits<16> opcode, } multiclass CondBinaryRIEPair<string mnemonic, bits<16> opcode, - RegisterOperand cls, Immediate imm> { + RegisterOperand cls, ImmOpWithPattern imm> { let isCodeGenOnly = 1 in def "" : CondBinaryRIE<mnemonic, opcode, cls, imm>; def Asm : AsmCondBinaryRIE<mnemonic, opcode, cls, imm>; } class BinaryRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRILa<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2), mnemonic#"\t$R1, $I2", [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> { @@ -3484,7 +3484,7 @@ class BinaryVRIb<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr, bits<4> type> : InstVRIb<opcode, (outs tr.op:$V1), (ins imm32zx8:$I2, imm32zx8:$I3), mnemonic#"\t$V1, $I2, $I3", - [(set (tr.vt tr.op:$V1), (operator imm32zx8:$I2, imm32zx8:$I3))]> { + [(set (tr.vt tr.op:$V1), (operator imm32zx8_timm:$I2, imm32zx8_timm:$I3))]> { let M4 = type; } @@ -3498,7 +3498,7 @@ class BinaryVRIc<string mnemonic, bits<16> opcode, SDPatternOperator operator, : InstVRIc<opcode, (outs tr1.op:$V1), (ins tr2.op:$V3, imm32zx16:$I2), mnemonic#"\t$V1, $V3, $I2", [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V3), - imm32zx16:$I2))]> { + imm32zx16_timm:$I2))]> { let M4 = type; } @@ -3512,7 +3512,7 @@ class BinaryVRIe<string mnemonic, bits<16> opcode, SDPatternOperator operator, : InstVRIe<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, imm32zx12:$I3), mnemonic#"\t$V1, $V2, $I3", [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2), - imm32zx12:$I3))]> { + imm32zx12_timm:$I3))]> { let M4 = type; let M5 = m5; } @@ -3715,7 +3715,7 @@ class BinaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator, : InstVRX<opcode, (outs VR128:$V1), (ins bdxaddr12only:$XBD2, imm32zx4:$M3), mnemonic#"\t$V1, $XBD2, $M3", [(set (tr.vt tr.op:$V1), (operator bdxaddr12only:$XBD2, - imm32zx4:$M3))]> { + imm32zx4_timm:$M3))]> { let mayLoad = 1; let AccessBytes = bytes; } @@ -3765,7 +3765,7 @@ class BinaryVSI<string mnemonic, bits<16> opcode, SDPatternOperator operator, } class StoreBinaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes, - Immediate index> + ImmOpWithPattern index> : InstVRV<opcode, (outs), (ins VR128:$V1, bdvaddr12only:$VBD2, index:$M3), mnemonic#"\t$V1, $VBD2, $M3", []> { let mayStore = 1; @@ -3774,7 
+3774,7 @@ class StoreBinaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes, class StoreBinaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr, bits<5> bytes, - Immediate index> + ImmOpWithPattern index> : InstVRX<opcode, (outs), (ins tr.op:$V1, bdxaddr12only:$XBD2, index:$M3), mnemonic#"\t$V1, $XBD2, $M3", [(operator (tr.vt tr.op:$V1), bdxaddr12only:$XBD2, index:$M3)]> { @@ -3809,7 +3809,7 @@ class CompareRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator, } class CompareRI<string mnemonic, bits<12> opcode, SDPatternOperator operator, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIa<opcode, (outs), (ins cls:$R1, imm:$I2), mnemonic#"\t$R1, $I2", [(set CC, (operator cls:$R1, imm:$I2))]> { @@ -3817,7 +3817,7 @@ class CompareRI<string mnemonic, bits<12> opcode, SDPatternOperator operator, } class CompareRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRILa<opcode, (outs), (ins cls:$R1, imm:$I2), mnemonic#"\t$R1, $I2", [(set CC, (operator cls:$R1, imm:$I2))]> { @@ -3924,7 +3924,7 @@ class CompareSSb<string mnemonic, bits<8> opcode> } class CompareSI<string mnemonic, bits<8> opcode, SDPatternOperator operator, - SDPatternOperator load, Immediate imm, + SDPatternOperator load, ImmOpWithPattern imm, AddressingMode mode = bdaddr12only> : InstSI<opcode, (outs), (ins mode:$BD1, imm:$I2), mnemonic#"\t$BD1, $I2", @@ -3934,7 +3934,7 @@ class CompareSI<string mnemonic, bits<8> opcode, SDPatternOperator operator, } class CompareSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator, - SDPatternOperator load, Immediate imm> + SDPatternOperator load, ImmOpWithPattern imm> : InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2), mnemonic#"\t$BD1, $I2", [(set CC, (operator (load bdaddr12only:$BD1), imm:$I2))]> { @@ -3943,7 +3943,7 @@ class CompareSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator, } class CompareSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator, - SDPatternOperator load, Immediate imm, + SDPatternOperator load, ImmOpWithPattern imm, AddressingMode mode = bdaddr20only> : InstSIY<opcode, (outs), (ins mode:$BD1, imm:$I2), mnemonic#"\t$BD1, $I2", @@ -3954,7 +3954,7 @@ class CompareSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator, multiclass CompareSIPair<string mnemonic, bits<8> siOpcode, bits<16> siyOpcode, SDPatternOperator operator, SDPatternOperator load, - Immediate imm> { + ImmOpWithPattern imm> { let DispKey = mnemonic in { let DispSize = "12" in def "" : CompareSI<mnemonic, siOpcode, operator, load, imm, bdaddr12pair>; @@ -4012,7 +4012,7 @@ class TestRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator, } class TestBinarySIL<string mnemonic, bits<16> opcode, - SDPatternOperator operator, Immediate imm> + SDPatternOperator operator, ImmOpWithPattern imm> : InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2), mnemonic#"\t$BD1, $I2", [(set CC, (operator bdaddr12only:$BD1, imm:$I2))]>; @@ -4073,7 +4073,7 @@ class SideEffectTernaryMemMemMemRRFb<string mnemonic, bits<16> opcode, class SideEffectTernaryRRFc<string mnemonic, bits<16> opcode, RegisterOperand cls1, RegisterOperand cls2, - Immediate imm> + ImmOpWithPattern imm> : InstRRFc<opcode, (outs), (ins cls1:$R1, cls2:$R2, imm:$M3), mnemonic#"\t$R1, $R2, $M3", []>; @@ -4086,7 +4086,7 @@ multiclass SideEffectTernaryRRFcOpt<string mnemonic, bits<16> opcode, class 
SideEffectTernaryMemMemRRFc<string mnemonic, bits<16> opcode, RegisterOperand cls1, RegisterOperand cls2, - Immediate imm> + ImmOpWithPattern imm> : InstRRFc<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src, imm:$M3), mnemonic#"\t$R1, $R2, $M3", []> { @@ -4221,7 +4221,7 @@ class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator, } class TernaryVRIa<string mnemonic, bits<16> opcode, SDPatternOperator operator, - TypedReg tr1, TypedReg tr2, Immediate imm, Immediate index> + TypedReg tr1, TypedReg tr2, ImmOpWithPattern imm, ImmOpWithPattern index> : InstVRIa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V1src, imm:$I2, index:$M3), mnemonic#"\t$V1, $I2, $M3", [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V1src), @@ -4237,7 +4237,7 @@ class TernaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operator, mnemonic#"\t$V1, $V2, $V3, $I4", [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2), (tr2.vt tr2.op:$V3), - imm32zx8:$I4))]> { + imm32zx8_timm:$I4))]> { let M5 = type; } @@ -4252,8 +4252,8 @@ class TernaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator, (ins tr2.op:$V2, imm32zx4:$M4, imm32zx4:$M5), mnemonic#"\t$V1, $V2, $M4, $M5", [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2), - imm32zx4:$M4, - imm32zx4:$M5))], + imm32zx4_timm:$M4, + imm32zx4_timm:$M5))], m4or> { let M3 = type; } @@ -4285,13 +4285,13 @@ multiclass TernaryOptVRRbSPair<string mnemonic, bits<16> opcode, TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> modifier = 0> { def "" : TernaryVRRb<mnemonic, opcode, operator, tr1, tr2, type, - imm32zx4even, !and (modifier, 14)>; + imm32zx4even_timm, !and (modifier, 14)>; def : InstAlias<mnemonic#"\t$V1, $V2, $V3", (!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2, tr2.op:$V3, 0)>; let Defs = [CC] in def S : TernaryVRRb<mnemonic##"s", opcode, operator_cc, tr1, tr2, type, - imm32zx4even, !add(!and (modifier, 14), 1)>; + imm32zx4even_timm, !add(!and (modifier, 14), 1)>; def : InstAlias<mnemonic#"s\t$V1, $V2, $V3", (!cast<Instruction>(NAME#"S") tr1.op:$V1, tr2.op:$V2, tr2.op:$V3, 0)>; @@ -4314,7 +4314,7 @@ class TernaryVRRc<string mnemonic, bits<16> opcode, SDPatternOperator operator, mnemonic#"\t$V1, $V2, $V3, $M4", [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2), (tr2.vt tr2.op:$V3), - imm32zx4:$M4))]> { + imm32zx4_timm:$M4))]> { let M5 = 0; let M6 = 0; } @@ -4327,7 +4327,7 @@ class TernaryVRRcFloat<string mnemonic, bits<16> opcode, mnemonic#"\t$V1, $V2, $V3, $M6", [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2), (tr2.vt tr2.op:$V3), - imm32zx4:$M6))]> { + imm32zx4_timm:$M6))]> { let M4 = type; let M5 = m5; } @@ -4429,7 +4429,7 @@ class TernaryVRSbGeneric<string mnemonic, bits<16> opcode> } class TernaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes, - Immediate index> + ImmOpWithPattern index> : InstVRV<opcode, (outs VR128:$V1), (ins VR128:$V1src, bdvaddr12only:$VBD2, index:$M3), mnemonic#"\t$V1, $VBD2, $M3", []> { @@ -4440,7 +4440,7 @@ class TernaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes, } class TernaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator, - TypedReg tr1, TypedReg tr2, bits<5> bytes, Immediate index> + TypedReg tr1, TypedReg tr2, bits<5> bytes, ImmOpWithPattern index> : InstVRX<opcode, (outs tr1.op:$V1), (ins tr2.op:$V1src, bdxaddr12only:$XBD2, index:$M3), mnemonic#"\t$V1, $XBD2, $M3", @@ -4461,7 +4461,7 @@ class QuaternaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operato [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V1src), 
(tr2.vt tr2.op:$V2), (tr2.vt tr2.op:$V3), - imm32zx8:$I4))]> { + imm32zx8_timm:$I4))]> { let Constraints = "$V1 = $V1src"; let DisableEncoding = "$V1src"; let M5 = type; @@ -4480,7 +4480,7 @@ class QuaternaryVRIf<string mnemonic, bits<16> opcode> : InstVRIf<opcode, (outs VR128:$V1), (ins VR128:$V2, VR128:$V3, imm32zx8:$I4, imm32zx4:$M5), - mnemonic#"\t$V1, $V2, $V3, $I4, $M5", []>; + mnemonic#"\t$V1, $V2, $V3, $I4, $M5", []>; class QuaternaryVRIg<string mnemonic, bits<16> opcode> : InstVRIg<opcode, (outs VR128:$V1), @@ -4491,7 +4491,7 @@ class QuaternaryVRIg<string mnemonic, bits<16> opcode> class QuaternaryVRRd<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, TypedReg tr3, TypedReg tr4, bits<4> type, - SDPatternOperator m6mask = imm32zx4, bits<4> m6or = 0> + SDPatternOperator m6mask = imm32zx4_timm, bits<4> m6or = 0> : InstVRRd<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, tr3.op:$V3, tr4.op:$V4, m6mask:$M6), mnemonic#"\t$V1, $V2, $V3, $V4, $M6", @@ -4518,14 +4518,14 @@ multiclass QuaternaryOptVRRdSPair<string mnemonic, bits<16> opcode, bits<4> modifier = 0> { def "" : QuaternaryVRRd<mnemonic, opcode, operator, tr1, tr2, tr2, tr2, type, - imm32zx4even, !and (modifier, 14)>; + imm32zx4even_timm, !and (modifier, 14)>; def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $V4", (!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2, tr2.op:$V3, tr2.op:$V4, 0)>; let Defs = [CC] in def S : QuaternaryVRRd<mnemonic##"s", opcode, operator_cc, tr1, tr2, tr2, tr2, type, - imm32zx4even, !add (!and (modifier, 14), 1)>; + imm32zx4even_timm, !add (!and (modifier, 14), 1)>; def : InstAlias<mnemonic#"s\t$V1, $V2, $V3, $V4", (!cast<Instruction>(NAME#"S") tr1.op:$V1, tr2.op:$V2, tr2.op:$V3, tr2.op:$V4, 0)>; @@ -4536,7 +4536,7 @@ multiclass QuaternaryOptVRRdSPairGeneric<string mnemonic, bits<16> opcode> { def "" : QuaternaryVRRdGeneric<mnemonic, opcode>; def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $V4, $M5", (!cast<Instruction>(NAME) VR128:$V1, VR128:$V2, VR128:$V3, - VR128:$V4, imm32zx4:$M5, 0)>; + VR128:$V4, imm32zx4_timm:$M5, 0)>; } class SideEffectQuaternaryRRFa<string mnemonic, bits<16> opcode, @@ -4638,13 +4638,13 @@ class RotateSelectRIEf<string mnemonic, bits<16> opcode, RegisterOperand cls1, class PrefetchRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator> : InstRXYb<opcode, (outs), (ins imm32zx4:$M1, bdxaddr20only:$XBD2), mnemonic##"\t$M1, $XBD2", - [(operator imm32zx4:$M1, bdxaddr20only:$XBD2)]>; + [(operator imm32zx4_timm:$M1, bdxaddr20only:$XBD2)]>; class PrefetchRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator> - : InstRILc<opcode, (outs), (ins imm32zx4:$M1, pcrel32:$RI2), + : InstRILc<opcode, (outs), (ins imm32zx4_timm:$M1, pcrel32:$RI2), mnemonic##"\t$M1, $RI2", - [(operator imm32zx4:$M1, pcrel32:$RI2)]> { + [(operator imm32zx4_timm:$M1, pcrel32:$RI2)]> { // We want PC-relative addresses to be tried ahead of BD and BDX addresses. // However, BDXs have two extra operands and are therefore 6 units more // complex. @@ -4691,7 +4691,7 @@ class Pseudo<dag outs, dag ins, list<dag> pattern> // Like UnaryRI, but expanded after RA depending on the choice of register. class UnaryRIPseudo<SDPatternOperator operator, RegisterOperand cls, - Immediate imm> + ImmOpWithPattern imm> : Pseudo<(outs cls:$R1), (ins imm:$I2), [(set cls:$R1, (operator imm:$I2))]>; @@ -4720,7 +4720,7 @@ class UnaryRRPseudo<string key, SDPatternOperator operator, // Like BinaryRI, but expanded after RA depending on the choice of register. 
class BinaryRIPseudo<SDPatternOperator operator, RegisterOperand cls, - Immediate imm> + ImmOpWithPattern imm> : Pseudo<(outs cls:$R1), (ins cls:$R1src, imm:$I2), [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> { let Constraints = "$R1 = $R1src"; @@ -4728,13 +4728,13 @@ class BinaryRIPseudo<SDPatternOperator operator, RegisterOperand cls, // Like BinaryRIE, but expanded after RA depending on the choice of register. class BinaryRIEPseudo<SDPatternOperator operator, RegisterOperand cls, - Immediate imm> + ImmOpWithPattern imm> : Pseudo<(outs cls:$R1), (ins cls:$R3, imm:$I2), [(set cls:$R1, (operator cls:$R3, imm:$I2))]>; // Like BinaryRIAndK, but expanded after RA depending on the choice of register. multiclass BinaryRIAndKPseudo<string key, SDPatternOperator operator, - RegisterOperand cls, Immediate imm> { + RegisterOperand cls, ImmOpWithPattern imm> { let NumOpsKey = key in { let NumOpsValue = "3" in def K : BinaryRIEPseudo<operator, cls, imm>, @@ -4764,7 +4764,7 @@ class MemFoldPseudo<string mnemonic, RegisterOperand cls, bits<5> bytes, // Like CompareRI, but expanded after RA depending on the choice of register. class CompareRIPseudo<SDPatternOperator operator, RegisterOperand cls, - Immediate imm> + ImmOpWithPattern imm> : Pseudo<(outs), (ins cls:$R1, imm:$I2), [(set CC, (operator cls:$R1, imm:$I2))]> { let isCompare = 1; @@ -4783,7 +4783,7 @@ class CompareRXYPseudo<SDPatternOperator operator, RegisterOperand cls, } // Like TestBinarySIL, but expanded later. -class TestBinarySILPseudo<SDPatternOperator operator, Immediate imm> +class TestBinarySILPseudo<SDPatternOperator operator, ImmOpWithPattern imm> : Pseudo<(outs), (ins bdaddr12only:$BD1, imm:$I2), [(set CC, (operator bdaddr12only:$BD1, imm:$I2))]>; @@ -4812,7 +4812,7 @@ class CondBinaryRRFaPseudo<RegisterOperand cls1, RegisterOperand cls2, // Like CondBinaryRIE, but expanded after RA depending on the choice of // register. -class CondBinaryRIEPseudo<RegisterOperand cls, Immediate imm> +class CondBinaryRIEPseudo<RegisterOperand cls, ImmOpWithPattern imm> : Pseudo<(outs cls:$R1), (ins cls:$R1src, imm:$I2, cond4:$valid, cond4:$M3), [(set cls:$R1, (z_select_ccmask imm:$I2, cls:$R1src, @@ -4876,7 +4876,7 @@ class SelectWrapper<ValueType vt, RegisterOperand cls> : Pseudo<(outs cls:$dst), (ins cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc), [(set (vt cls:$dst), (z_select_ccmask cls:$src1, cls:$src2, - imm32zx4:$valid, imm32zx4:$cc))]> { + imm32zx4_timm:$valid, imm32zx4_timm:$cc))]> { let usesCustomInserter = 1; let hasNoSchedulingInfo = 1; let Uses = [CC]; @@ -4890,12 +4890,12 @@ multiclass CondStores<RegisterOperand cls, SDPatternOperator store, def "" : Pseudo<(outs), (ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc), [(store (z_select_ccmask cls:$new, (load mode:$addr), - imm32zx4:$valid, imm32zx4:$cc), + imm32zx4_timm:$valid, imm32zx4_timm:$cc), mode:$addr)]>; def Inv : Pseudo<(outs), (ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc), [(store (z_select_ccmask (load mode:$addr), cls:$new, - imm32zx4:$valid, imm32zx4:$cc), + imm32zx4_timm:$valid, imm32zx4_timm:$cc), mode:$addr)]>; } } @@ -4917,11 +4917,11 @@ class AtomicLoadBinary<SDPatternOperator operator, RegisterOperand cls, // Specializations of AtomicLoadWBinary. 
class AtomicLoadBinaryReg32<SDPatternOperator operator> : AtomicLoadBinary<operator, GR32, (i32 GR32:$src2), GR32>; -class AtomicLoadBinaryImm32<SDPatternOperator operator, Immediate imm> +class AtomicLoadBinaryImm32<SDPatternOperator operator, ImmOpWithPattern imm> : AtomicLoadBinary<operator, GR32, (i32 imm:$src2), imm>; class AtomicLoadBinaryReg64<SDPatternOperator operator> : AtomicLoadBinary<operator, GR64, (i64 GR64:$src2), GR64>; -class AtomicLoadBinaryImm64<SDPatternOperator operator, Immediate imm> +class AtomicLoadBinaryImm64<SDPatternOperator operator, ImmOpWithPattern imm> : AtomicLoadBinary<operator, GR64, (i64 imm:$src2), imm>; // OPERATOR is ATOMIC_SWAPW or an ATOMIC_LOADW_* operation. PAT and OPERAND @@ -4944,7 +4944,7 @@ class AtomicLoadWBinary<SDPatternOperator operator, dag pat, // Specializations of AtomicLoadWBinary. class AtomicLoadWBinaryReg<SDPatternOperator operator> : AtomicLoadWBinary<operator, (i32 GR32:$src2), GR32>; -class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm> +class AtomicLoadWBinaryImm<SDPatternOperator operator, ImmOpWithPattern imm> : AtomicLoadWBinary<operator, (i32 imm:$src2), imm>; // A pseudo instruction that is a direct alias of a real instruction. @@ -4979,7 +4979,7 @@ class StoreAliasVRX<SDPatternOperator operator, TypedReg tr, // An alias of a BinaryRI, but with different register sizes. class BinaryAliasRI<SDPatternOperator operator, RegisterOperand cls, - Immediate imm> + ImmOpWithPattern imm> : Alias<4, (outs cls:$R1), (ins cls:$R1src, imm:$I2), [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> { let Constraints = "$R1 = $R1src"; @@ -4987,7 +4987,7 @@ class BinaryAliasRI<SDPatternOperator operator, RegisterOperand cls, // An alias of a BinaryRIL, but with different register sizes. class BinaryAliasRIL<SDPatternOperator operator, RegisterOperand cls, - Immediate imm> + ImmOpWithPattern imm> : Alias<6, (outs cls:$R1), (ins cls:$R1src, imm:$I2), [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> { let Constraints = "$R1 = $R1src"; @@ -4999,7 +4999,7 @@ class BinaryAliasVRRf<RegisterOperand cls> // An alias of a CompareRI, but with different register sizes. class CompareAliasRI<SDPatternOperator operator, RegisterOperand cls, - Immediate imm> + ImmOpWithPattern imm> : Alias<4, (outs), (ins cls:$R1, imm:$I2), [(set CC, (operator cls:$R1, imm:$I2))]> { let isCompare = 1; diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 57c1cf4ec70a..bc783608d45b 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -46,22 +46,12 @@ using namespace llvm; #include "SystemZGenInstrInfo.inc" #define DEBUG_TYPE "systemz-II" -STATISTIC(LOCRMuxJumps, "Number of LOCRMux jump-sequences (lower is better)"); // Return a mask with Count low bits set. static uint64_t allOnes(unsigned int Count) { return Count == 0 ? 0 : (uint64_t(1) << (Count - 1) << 1) - 1; } -// Reg should be a 32-bit GPR. Return true if it is a high register rather -// than a low register. -static bool isHighReg(unsigned int Reg) { - if (SystemZ::GRH32BitRegClass.contains(Reg)) - return true; - assert(SystemZ::GR32BitRegClass.contains(Reg) && "Invalid GRX32"); - return false; -} - // Pin the vtable to this file. void SystemZInstrInfo::anchor() {} @@ -85,7 +75,7 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI, // Set up the two 64-bit registers and remember super reg and its flags. 
MachineOperand &HighRegOp = EarlierMI->getOperand(0); MachineOperand &LowRegOp = MI->getOperand(0); - unsigned Reg128 = LowRegOp.getReg(); + Register Reg128 = LowRegOp.getReg(); unsigned Reg128Killed = getKillRegState(LowRegOp.isKill()); unsigned Reg128Undef = getUndefRegState(LowRegOp.isUndef()); HighRegOp.setReg(RI.getSubReg(HighRegOp.getReg(), SystemZ::subreg_h64)); @@ -147,8 +137,8 @@ void SystemZInstrInfo::splitAdjDynAlloc(MachineBasicBlock::iterator MI) const { void SystemZInstrInfo::expandRIPseudo(MachineInstr &MI, unsigned LowOpcode, unsigned HighOpcode, bool ConvertHigh) const { - unsigned Reg = MI.getOperand(0).getReg(); - bool IsHigh = isHighReg(Reg); + Register Reg = MI.getOperand(0).getReg(); + bool IsHigh = SystemZ::isHighReg(Reg); MI.setDesc(get(IsHigh ? HighOpcode : LowOpcode)); if (IsHigh && ConvertHigh) MI.getOperand(1).setImm(uint32_t(MI.getOperand(1).getImm())); @@ -161,10 +151,10 @@ void SystemZInstrInfo::expandRIPseudo(MachineInstr &MI, unsigned LowOpcode, void SystemZInstrInfo::expandRIEPseudo(MachineInstr &MI, unsigned LowOpcode, unsigned LowOpcodeK, unsigned HighOpcode) const { - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); - bool DestIsHigh = isHighReg(DestReg); - bool SrcIsHigh = isHighReg(SrcReg); + Register DestReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + bool DestIsHigh = SystemZ::isHighReg(DestReg); + bool SrcIsHigh = SystemZ::isHighReg(SrcReg); if (!DestIsHigh && !SrcIsHigh) MI.setDesc(get(LowOpcodeK)); else { @@ -184,9 +174,10 @@ void SystemZInstrInfo::expandRIEPseudo(MachineInstr &MI, unsigned LowOpcode, // is a high GR32. void SystemZInstrInfo::expandRXYPseudo(MachineInstr &MI, unsigned LowOpcode, unsigned HighOpcode) const { - unsigned Reg = MI.getOperand(0).getReg(); - unsigned Opcode = getOpcodeForOffset(isHighReg(Reg) ? HighOpcode : LowOpcode, - MI.getOperand(2).getImm()); + Register Reg = MI.getOperand(0).getReg(); + unsigned Opcode = getOpcodeForOffset( + SystemZ::isHighReg(Reg) ? HighOpcode : LowOpcode, + MI.getOperand(2).getImm()); MI.setDesc(get(Opcode)); } @@ -195,93 +186,11 @@ void SystemZInstrInfo::expandRXYPseudo(MachineInstr &MI, unsigned LowOpcode, // register is a low GR32 and HighOpcode if the register is a high GR32. void SystemZInstrInfo::expandLOCPseudo(MachineInstr &MI, unsigned LowOpcode, unsigned HighOpcode) const { - unsigned Reg = MI.getOperand(0).getReg(); - unsigned Opcode = isHighReg(Reg) ? HighOpcode : LowOpcode; + Register Reg = MI.getOperand(0).getReg(); + unsigned Opcode = SystemZ::isHighReg(Reg) ? HighOpcode : LowOpcode; MI.setDesc(get(Opcode)); } -// MI is a load-register-on-condition pseudo instruction. Replace it with -// LowOpcode if source and destination are both low GR32s and HighOpcode if -// source and destination are both high GR32s. -void SystemZInstrInfo::expandLOCRPseudo(MachineInstr &MI, unsigned LowOpcode, - unsigned HighOpcode) const { - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(2).getReg(); - bool DestIsHigh = isHighReg(DestReg); - bool SrcIsHigh = isHighReg(SrcReg); - - if (!DestIsHigh && !SrcIsHigh) - MI.setDesc(get(LowOpcode)); - else if (DestIsHigh && SrcIsHigh) - MI.setDesc(get(HighOpcode)); - else - LOCRMuxJumps++; - - // If we were unable to implement the pseudo with a single instruction, we - // need to convert it back into a branch sequence. This cannot be done here - // since the caller of expandPostRAPseudo does not handle changes to the CFG - // correctly. 
This change is deferred to the SystemZExpandPseudo pass. -} - -// MI is a select pseudo instruction. Replace it with LowOpcode if source -// and destination are all low GR32s and HighOpcode if source and destination -// are all high GR32s. Otherwise, use the two-operand MixedOpcode. -void SystemZInstrInfo::expandSELRPseudo(MachineInstr &MI, unsigned LowOpcode, - unsigned HighOpcode, - unsigned MixedOpcode) const { - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned Src1Reg = MI.getOperand(1).getReg(); - unsigned Src2Reg = MI.getOperand(2).getReg(); - bool DestIsHigh = isHighReg(DestReg); - bool Src1IsHigh = isHighReg(Src1Reg); - bool Src2IsHigh = isHighReg(Src2Reg); - - // If sources and destination aren't all high or all low, we may be able to - // simplify the operation by moving one of the sources to the destination - // first. But only if this doesn't clobber the other source. - if (DestReg != Src1Reg && DestReg != Src2Reg) { - if (DestIsHigh != Src1IsHigh) { - emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, Src1Reg, - SystemZ::LR, 32, MI.getOperand(1).isKill(), - MI.getOperand(1).isUndef()); - MI.getOperand(1).setReg(DestReg); - Src1Reg = DestReg; - Src1IsHigh = DestIsHigh; - } else if (DestIsHigh != Src2IsHigh) { - emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, Src2Reg, - SystemZ::LR, 32, MI.getOperand(2).isKill(), - MI.getOperand(2).isUndef()); - MI.getOperand(2).setReg(DestReg); - Src2Reg = DestReg; - Src2IsHigh = DestIsHigh; - } - } - - // If the destination (now) matches one source, prefer this to be first. - if (DestReg != Src1Reg && DestReg == Src2Reg) { - commuteInstruction(MI, false, 1, 2); - std::swap(Src1Reg, Src2Reg); - std::swap(Src1IsHigh, Src2IsHigh); - } - - if (!DestIsHigh && !Src1IsHigh && !Src2IsHigh) - MI.setDesc(get(LowOpcode)); - else if (DestIsHigh && Src1IsHigh && Src2IsHigh) - MI.setDesc(get(HighOpcode)); - else { - // Given the simplification above, we must already have a two-operand case. - assert (DestReg == Src1Reg); - MI.setDesc(get(MixedOpcode)); - MI.tieOperands(0, 1); - LOCRMuxJumps++; - } - - // If we were unable to implement the pseudo with a single instruction, we - // need to convert it back into a branch sequence. This cannot be done here - // since the caller of expandPostRAPseudo does not handle changes to the CFG - // correctly. This change is deferred to the SystemZExpandPseudo pass. -} - // MI is an RR-style pseudo instruction that zero-extends the low Size bits // of one GRX32 into another. Replace it with LowOpcode if both operands // are low registers, otherwise use RISB[LH]G. @@ -302,8 +211,8 @@ void SystemZInstrInfo::expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode, void SystemZInstrInfo::expandLoadStackGuard(MachineInstr *MI) const { MachineBasicBlock *MBB = MI->getParent(); MachineFunction &MF = *MBB->getParent(); - const unsigned Reg64 = MI->getOperand(0).getReg(); - const unsigned Reg32 = RI.getSubReg(Reg64, SystemZ::subreg_l32); + const Register Reg64 = MI->getOperand(0).getReg(); + const Register Reg32 = RI.getSubReg(Reg64, SystemZ::subreg_l32); // EAR can only load the low subregister so use a shift for %a0 to produce // the GR containing %a0 and %a1.
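The hunks above drop the LOCRMux/SELRMux handling from expandPostRAPseudo; the SystemZPostRewrite.cpp hunks further down recreate it in a pass that may legally rewrite the CFG. As a rough standalone model of the opcode choice involved (the enums are illustrative stand-ins, not the real SystemZ opcode values):

#include <cassert>

// Toy model of the LOCRMux lowering decision: LOCR only moves between
// low GR32s and LOCFHR only between high GR32s, so a mixed low/high
// pair has no single encoding and must become a branch around a COPY.
enum Bank { LowGR32, HighGR32 };
enum Lowering { UseLOCR, UseLOCFHR, UseBranchSequence };

Lowering lowerLOCRMux(Bank Dest, Bank Src) {
  if (Dest == LowGR32 && Src == LowGR32)
    return UseLOCR;
  if (Dest == HighGR32 && Src == HighGR32)
    return UseLOCFHR;
  return UseBranchSequence; // mixed banks: expandCondMove-style sequence
}

int main() {
  assert(lowerLOCRMux(LowGR32, LowGR32) == UseLOCR);
  assert(lowerLOCRMux(HighGR32, HighGR32) == UseLOCFHR);
  assert(lowerLOCRMux(LowGR32, HighGR32) == UseBranchSequence);
}

In the mixed case, the expandCondMove hunk below branches to the rest-block on the inverted condition (CCMask ^ CCValid) and falls through into a block holding the COPY, which is exactly the CFG surgery that expandPostRAPseudo's callers could not tolerate.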
@@ -341,8 +250,8 @@ SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB, unsigned Size, bool KillSrc, bool UndefSrc) const { unsigned Opcode; - bool DestIsHigh = isHighReg(DestReg); - bool SrcIsHigh = isHighReg(SrcReg); + bool DestIsHigh = SystemZ::isHighReg(DestReg); + bool SrcIsHigh = SystemZ::isHighReg(SrcReg); if (DestIsHigh && SrcIsHigh) Opcode = SystemZ::RISBHH; else if (DestIsHigh && !SrcIsHigh) @@ -468,7 +377,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB, // Can't handle indirect branches. SystemZII::Branch Branch(getBranchInfo(*I)); - if (!Branch.Target->isMBB()) + if (!Branch.hasMBBTarget()) return true; // Punt on compound branches. @@ -478,7 +387,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB, if (Branch.CCMask == SystemZ::CCMASK_ANY) { // Handle unconditional branches. if (!AllowModify) { - TBB = Branch.Target->getMBB(); + TBB = Branch.getMBBTarget(); continue; } @@ -490,7 +399,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB, FBB = nullptr; // Delete the JMP if it's equivalent to a fall-through. - if (MBB.isLayoutSuccessor(Branch.Target->getMBB())) { + if (MBB.isLayoutSuccessor(Branch.getMBBTarget())) { TBB = nullptr; I->eraseFromParent(); I = MBB.end(); @@ -498,7 +407,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB, } // TBB is used to indicate the unconditional destination. - TBB = Branch.Target->getMBB(); + TBB = Branch.getMBBTarget(); continue; } @@ -506,7 +415,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB, if (Cond.empty()) { // FIXME: add X86-style branch swap FBB = TBB; - TBB = Branch.Target->getMBB(); + TBB = Branch.getMBBTarget(); Cond.push_back(MachineOperand::CreateImm(Branch.CCValid)); Cond.push_back(MachineOperand::CreateImm(Branch.CCMask)); continue; @@ -517,7 +426,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB, // Only handle the case where all conditional branches branch to the same // destination. - if (TBB != Branch.Target->getMBB()) + if (TBB != Branch.getMBBTarget()) return true; // If the conditions are the same, we can leave them alone. @@ -547,7 +456,7 @@ unsigned SystemZInstrInfo::removeBranch(MachineBasicBlock &MBB, continue; if (!I->isBranch()) break; - if (!getBranchInfo(*I).Target->isMBB()) + if (!getBranchInfo(*I).hasMBBTarget()) break; // Remove the branch. I->eraseFromParent(); @@ -676,8 +585,8 @@ void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB, else { Opc = SystemZ::LOCR; MRI.constrainRegClass(DstReg, &SystemZ::GR32BitRegClass); - unsigned TReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); - unsigned FReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); + Register TReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); + Register FReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); BuildMI(MBB, I, DL, get(TargetOpcode::COPY), TReg).addReg(TrueReg); BuildMI(MBB, I, DL, get(TargetOpcode::COPY), FReg).addReg(FalseReg); TrueReg = TReg; @@ -1258,13 +1167,14 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( assert(NumOps == 3 && "Expected two source registers."); Register DstReg = MI.getOperand(0).getReg(); Register DstPhys = - (TRI->isVirtualRegister(DstReg) ? VRM->getPhys(DstReg) : DstReg); + (Register::isVirtualRegister(DstReg) ? VRM->getPhys(DstReg) : DstReg); Register SrcReg = (OpNum == 2 ? MI.getOperand(1).getReg() : ((OpNum == 1 && MI.isCommutable()) ?
MI.getOperand(2).getReg() : Register())); if (DstPhys && !SystemZ::GRH32BitRegClass.contains(DstPhys) && SrcReg && - TRI->isVirtualRegister(SrcReg) && DstPhys == VRM->getPhys(SrcReg)) + Register::isVirtualRegister(SrcReg) && + DstPhys == VRM->getPhys(SrcReg)) NeedsCommute = (OpNum == 1); else MemOpcode = -1; @@ -1358,15 +1268,6 @@ bool SystemZInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { expandLOCPseudo(MI, SystemZ::LOCHI, SystemZ::LOCHHI); return true; - case SystemZ::LOCRMux: - expandLOCRPseudo(MI, SystemZ::LOCR, SystemZ::LOCFHR); - return true; - - case SystemZ::SELRMux: - expandSELRPseudo(MI, SystemZ::SELR, SystemZ::SELFHR, - SystemZ::LOCRMux); - return true; - case SystemZ::STCMux: expandRXYPseudo(MI, SystemZ::STC, SystemZ::STCH); return true; @@ -1468,8 +1369,8 @@ bool SystemZInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; case SystemZ::RISBMux: { - bool DestIsHigh = isHighReg(MI.getOperand(0).getReg()); - bool SrcIsHigh = isHighReg(MI.getOperand(2).getReg()); + bool DestIsHigh = SystemZ::isHighReg(MI.getOperand(0).getReg()); + bool SrcIsHigh = SystemZ::isHighReg(MI.getOperand(2).getReg()); if (SrcIsHigh == DestIsHigh) MI.setDesc(get(DestIsHigh ? SystemZ::RISBHH : SystemZ::RISBLL)); else { @@ -1545,6 +1446,10 @@ SystemZInstrInfo::getBranchInfo(const MachineInstr &MI) const { return SystemZII::Branch(SystemZII::BranchCLG, SystemZ::CCMASK_ICMP, MI.getOperand(2).getImm(), &MI.getOperand(3)); + case SystemZ::INLINEASM_BR: + // Don't try to analyze asm goto, so pass nullptr as branch target argument. + return SystemZII::Branch(SystemZII::AsmGoto, 0, 0, nullptr); + default: llvm_unreachable("Unrecognized branch opcode"); } @@ -1845,8 +1750,7 @@ void SystemZInstrInfo::loadImmediate(MachineBasicBlock &MBB, bool SystemZInstrInfo:: areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, - const MachineInstr &MIb, - AliasAnalysis *AA) const { + const MachineInstr &MIb) const { if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) return false; diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h index 2edde175542e..6dc6e72aa52a 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/lib/Target/SystemZ/SystemZInstrInfo.h @@ -100,11 +100,18 @@ enum BranchType { // An instruction that decrements a 64-bit register and branches if // the result is nonzero. - BranchCTG + BranchCTG, + + // An instruction representing an asm goto statement. + AsmGoto }; // Information about a branch instruction. -struct Branch { +class Branch { + // The target of the branch. In case of INLINEASM_BR, this is nullptr. + const MachineOperand *Target; + +public: // The type of the branch. BranchType Type; @@ -114,12 +121,15 @@ struct Branch { // CCMASK_<N> is set if the branch should be taken when CC == N. unsigned CCMask; - // The target of the branch. - const MachineOperand *Target; - Branch(BranchType type, unsigned ccValid, unsigned ccMask, const MachineOperand *target) - : Type(type), CCValid(ccValid), CCMask(ccMask), Target(target) {} + : Target(target), Type(type), CCValid(ccValid), CCMask(ccMask) {} + + bool isIndirect() { return Target != nullptr && Target->isReg(); } + bool hasMBBTarget() { return Target != nullptr && Target->isMBB(); } + MachineBasicBlock *getMBBTarget() { + return hasMBBTarget() ? Target->getMBB() : nullptr; + } }; // Kinds of fused compares in compare-and-* instructions. 
Together with type @@ -160,10 +170,6 @@ class SystemZInstrInfo : public SystemZGenInstrInfo { unsigned HighOpcode) const; void expandLOCPseudo(MachineInstr &MI, unsigned LowOpcode, unsigned HighOpcode) const; - void expandLOCRPseudo(MachineInstr &MI, unsigned LowOpcode, - unsigned HighOpcode) const; - void expandSELRPseudo(MachineInstr &MI, unsigned LowOpcode, - unsigned HighOpcode, unsigned MixedOpcode) const; void expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode, unsigned Size) const; void expandLoadStackGuard(MachineInstr *MI) const; @@ -322,8 +328,7 @@ public: // memory addresses and false otherwise. bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, - const MachineInstr &MIb, - AliasAnalysis *AA = nullptr) const override; + const MachineInstr &MIb) const override; }; } // end namespace llvm diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index 91856893e3bd..8b334756611a 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -337,15 +337,15 @@ defm CondStore8Mux : CondStores<GRX32, nonvolatile_truncstorei8, defm CondStore16Mux : CondStores<GRX32, nonvolatile_truncstorei16, nonvolatile_anyextloadi16, bdxaddr20only>, Requires<[FeatureHighWord]>; -defm CondStore32Mux : CondStores<GRX32, nonvolatile_store, - nonvolatile_load, bdxaddr20only>, +defm CondStore32Mux : CondStores<GRX32, simple_store, + simple_load, bdxaddr20only>, Requires<[FeatureLoadStoreOnCond2]>; defm CondStore8 : CondStores<GR32, nonvolatile_truncstorei8, nonvolatile_anyextloadi8, bdxaddr20only>; defm CondStore16 : CondStores<GR32, nonvolatile_truncstorei16, nonvolatile_anyextloadi16, bdxaddr20only>; -defm CondStore32 : CondStores<GR32, nonvolatile_store, - nonvolatile_load, bdxaddr20only>; +defm CondStore32 : CondStores<GR32, simple_store, + simple_load, bdxaddr20only>; defm : CondStores64<CondStore8, CondStore8Inv, nonvolatile_truncstorei8, nonvolatile_anyextloadi8, bdxaddr20only>; @@ -353,8 +353,8 @@ defm : CondStores64<CondStore16, CondStore16Inv, nonvolatile_truncstorei16, nonvolatile_anyextloadi16, bdxaddr20only>; defm : CondStores64<CondStore32, CondStore32Inv, nonvolatile_truncstorei32, nonvolatile_anyextloadi32, bdxaddr20only>; -defm CondStore64 : CondStores<GR64, nonvolatile_store, - nonvolatile_load, bdxaddr20only>; +defm CondStore64 : CondStores<GR64, simple_store, + simple_load, bdxaddr20only>; //===----------------------------------------------------------------------===// // Move instructions @@ -531,8 +531,8 @@ let Predicates = [FeatureLoadStoreOnCond2], Uses = [CC] in { // Load on condition. Matched via DAG pattern. // Expands to LOC or LOCFH, depending on the choice of register. - def LOCMux : CondUnaryRSYPseudo<nonvolatile_load, GRX32, 4>; - defm LOCFH : CondUnaryRSYPair<"locfh", 0xEBE0, nonvolatile_load, GRH32, 4>; + def LOCMux : CondUnaryRSYPseudo<simple_load, GRX32, 4>; + defm LOCFH : CondUnaryRSYPair<"locfh", 0xEBE0, simple_load, GRH32, 4>; // Store on condition. Expanded from CondStore* pseudos. // Expands to STOC or STOCFH, depending on the choice of register. @@ -563,8 +563,8 @@ let Predicates = [FeatureLoadStoreOnCond], Uses = [CC] in { } // Load on condition. Matched via DAG pattern. - defm LOC : CondUnaryRSYPair<"loc", 0xEBF2, nonvolatile_load, GR32, 4>; - defm LOCG : CondUnaryRSYPair<"locg", 0xEBE2, nonvolatile_load, GR64, 8>; + defm LOC : CondUnaryRSYPair<"loc", 0xEBF2, simple_load, GR32, 4>; + defm LOCG : CondUnaryRSYPair<"locg", 0xEBE2, simple_load, GR64, 8>; // Store on condition. 
Expanded from CondStore* pseudos. defm STOC : CondStoreRSYPair<"stoc", 0xEBF3, GR32, 4>; @@ -2082,7 +2082,7 @@ let Predicates = [FeatureProcessorAssist] in { // cleared. We only use the first result here. let Defs = [CC] in def FLOGR : UnaryRRE<"flogr", 0xB983, null_frag, GR128, GR64>; -def : Pat<(ctlz GR64:$src), +def : Pat<(i64 (ctlz GR64:$src)), (EXTRACT_SUBREG (FLOGR GR64:$src), subreg_h64)>; // Population count. Counts bits set per byte or doubleword. diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td index 261727f89058..02364bbda5c1 100644 --- a/lib/Target/SystemZ/SystemZInstrVector.td +++ b/lib/Target/SystemZ/SystemZInstrVector.td @@ -60,7 +60,7 @@ let Predicates = [FeatureVector] in { // Generate byte mask. def VZERO : InherentVRIa<"vzero", 0xE744, 0>; def VONE : InherentVRIa<"vone", 0xE744, 0xffff>; - def VGBM : UnaryVRIa<"vgbm", 0xE744, z_byte_mask, v128b, imm32zx16>; + def VGBM : UnaryVRIa<"vgbm", 0xE744, z_byte_mask, v128b, imm32zx16_timm>; // Generate mask. def VGM : BinaryVRIbGeneric<"vgm", 0xE746>; @@ -71,10 +71,10 @@ let Predicates = [FeatureVector] in { // Replicate immediate. def VREPI : UnaryVRIaGeneric<"vrepi", 0xE745, imm32sx16>; - def VREPIB : UnaryVRIa<"vrepib", 0xE745, z_replicate, v128b, imm32sx16, 0>; - def VREPIH : UnaryVRIa<"vrepih", 0xE745, z_replicate, v128h, imm32sx16, 1>; - def VREPIF : UnaryVRIa<"vrepif", 0xE745, z_replicate, v128f, imm32sx16, 2>; - def VREPIG : UnaryVRIa<"vrepig", 0xE745, z_replicate, v128g, imm32sx16, 3>; + def VREPIB : UnaryVRIa<"vrepib", 0xE745, z_replicate, v128b, imm32sx16_timm, 0>; + def VREPIH : UnaryVRIa<"vrepih", 0xE745, z_replicate, v128h, imm32sx16_timm, 1>; + def VREPIF : UnaryVRIa<"vrepif", 0xE745, z_replicate, v128f, imm32sx16_timm, 2>; + def VREPIG : UnaryVRIa<"vrepig", 0xE745, z_replicate, v128g, imm32sx16_timm, 3>; } // Load element immediate. @@ -116,7 +116,7 @@ let Predicates = [FeatureVector] in { (ins bdxaddr12only:$XBD2, imm32zx4:$M3), "lcbb\t$R1, $XBD2, $M3", [(set GR32:$R1, (int_s390_lcbb bdxaddr12only:$XBD2, - imm32zx4:$M3))]>; + imm32zx4_timm:$M3))]>; // Load with length. The number of loaded bytes is only known at run time. def VLL : BinaryVRSb<"vll", 0xE737, int_s390_vll, 0>; @@ -362,9 +362,9 @@ let Predicates = [FeatureVector] in { def VREPH : BinaryVRIc<"vreph", 0xE74D, z_splat, v128h, v128h, 1>; def VREPF : BinaryVRIc<"vrepf", 0xE74D, z_splat, v128f, v128f, 2>; def VREPG : BinaryVRIc<"vrepg", 0xE74D, z_splat, v128g, v128g, 3>; - def : Pat<(v4f32 (z_splat VR128:$vec, imm32zx16:$index)), + def : Pat<(v4f32 (z_splat VR128:$vec, imm32zx16_timm:$index)), (VREPF VR128:$vec, imm32zx16:$index)>; - def : Pat<(v2f64 (z_splat VR128:$vec, imm32zx16:$index)), + def : Pat<(v2f64 (z_splat VR128:$vec, imm32zx16_timm:$index)), (VREPG VR128:$vec, imm32zx16:$index)>; // Select. @@ -778,7 +778,7 @@ let Predicates = [FeatureVector] in { // Shift left double by byte. def VSLDB : TernaryVRId<"vsldb", 0xE777, z_shl_double, v128b, v128b, 0>; - def : Pat<(int_s390_vsldb VR128:$x, VR128:$y, imm32zx8:$z), + def : Pat<(int_s390_vsldb VR128:$x, VR128:$y, imm32zx8_timm:$z), (VSLDB VR128:$x, VR128:$y, imm32zx8:$z)>; // Shift left double by bit. @@ -1069,7 +1069,7 @@ let Predicates = [FeatureVector] in { def WCGDB : TernaryVRRa<"wcgdb", 0xE7C2, null_frag, v64g, v64db, 3, 8>; } // Rounding mode should agree with SystemZInstrFP.td. 
- def : FPConversion<VCGDB, fp_to_sint, v128g, v128db, 0, 5>; + def : FPConversion<VCGDB, any_fp_to_sint, v128g, v128db, 0, 5>; let Predicates = [FeatureVectorEnhancements2] in { let Uses = [FPC], mayRaiseFPException = 1 in { let isAsmParserOnly = 1 in @@ -1078,7 +1078,7 @@ let Predicates = [FeatureVector] in { def WCFEB : TernaryVRRa<"wcfeb", 0xE7C2, null_frag, v32sb, v32f, 2, 8>; } // Rounding mode should agree with SystemZInstrFP.td. - def : FPConversion<VCFEB, fp_to_sint, v128f, v128sb, 0, 5>; + def : FPConversion<VCFEB, any_fp_to_sint, v128f, v128sb, 0, 5>; } // Convert to logical. @@ -1088,7 +1088,7 @@ let Predicates = [FeatureVector] in { def WCLGDB : TernaryVRRa<"wclgdb", 0xE7C0, null_frag, v64g, v64db, 3, 8>; } // Rounding mode should agree with SystemZInstrFP.td. - def : FPConversion<VCLGDB, fp_to_uint, v128g, v128db, 0, 5>; + def : FPConversion<VCLGDB, any_fp_to_uint, v128g, v128db, 0, 5>; let Predicates = [FeatureVectorEnhancements2] in { let Uses = [FPC], mayRaiseFPException = 1 in { let isAsmParserOnly = 1 in @@ -1097,7 +1097,7 @@ let Predicates = [FeatureVector] in { def WCLFEB : TernaryVRRa<"wclfeb", 0xE7C0, null_frag, v32sb, v32f, 2, 8>; } // Rounding mode should agree with SystemZInstrFP.td. - def : FPConversion<VCLFEB, fp_to_uint, v128f, v128sb, 0, 5>; + def : FPConversion<VCLFEB, any_fp_to_uint, v128f, v128sb, 0, 5>; } // Divide. diff --git a/lib/Target/SystemZ/SystemZLongBranch.cpp b/lib/Target/SystemZ/SystemZLongBranch.cpp index 95d7e22dec32..724111229569 100644 --- a/lib/Target/SystemZ/SystemZLongBranch.cpp +++ b/lib/Target/SystemZ/SystemZLongBranch.cpp @@ -85,9 +85,9 @@ struct MBBInfo { // This value never changes. uint64_t Size = 0; - // The minimum alignment of the block, as a log2 value. + // The minimum alignment of the block. // This value never changes. - unsigned Alignment = 0; + Align Alignment; // The number of terminators in this block. This value never changes. unsigned NumTerminators = 0; @@ -127,7 +127,8 @@ struct BlockPosition { // as the runtime address. unsigned KnownBits; - BlockPosition(unsigned InitialAlignment) : KnownBits(InitialAlignment) {} + BlockPosition(unsigned InitialLogAlignment) + : KnownBits(InitialLogAlignment) {} }; class SystemZLongBranch : public MachineFunctionPass { @@ -178,17 +179,16 @@ const uint64_t MaxForwardRange = 0xfffe; // instructions. void SystemZLongBranch::skipNonTerminators(BlockPosition &Position, MBBInfo &Block) { - if (Block.Alignment > Position.KnownBits) { + if (Log2(Block.Alignment) > Position.KnownBits) { // When calculating the address of Block, we need to conservatively // assume that Block had the worst possible misalignment. - Position.Address += ((uint64_t(1) << Block.Alignment) - - (uint64_t(1) << Position.KnownBits)); - Position.KnownBits = Block.Alignment; + Position.Address += + (Block.Alignment.value() - (uint64_t(1) << Position.KnownBits)); + Position.KnownBits = Log2(Block.Alignment); } // Align the addresses. - uint64_t AlignMask = (uint64_t(1) << Block.Alignment) - 1; - Position.Address = (Position.Address + AlignMask) & ~AlignMask; + Position.Address = alignTo(Position.Address, Block.Alignment); // Record the block's position. 
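The skipNonTerminators hunk just above swaps hand-rolled log2 mask arithmetic for the Align type and alignTo(). A freestanding check that the two roundings agree (plain C++; alignUpDiv stands in for llvm::alignTo and is not the LLVM implementation):

#include <cassert>
#include <cstdint>

// Old form: round Address up using a mask derived from a log2 alignment.
uint64_t alignUpMask(uint64_t Address, unsigned LogAlign) {
  uint64_t Mask = (uint64_t(1) << LogAlign) - 1;
  return (Address + Mask) & ~Mask;
}

// New form: round Address up to a multiple of a power-of-two Alignment,
// as alignTo(Position.Address, Block.Alignment) now does.
uint64_t alignUpDiv(uint64_t Address, uint64_t Alignment) {
  return (Address + Alignment - 1) / Alignment * Alignment;
}

int main() {
  for (unsigned Log = 0; Log < 12; ++Log)
    for (uint64_t Addr = 0; Addr < 1024; ++Addr)
      assert(alignUpMask(Addr, Log) == alignUpDiv(Addr, uint64_t(1) << Log));
}

The worst-case padding just above it is unchanged in spirit: Block.Alignment.value() is the old (uint64_t(1) << Block.Alignment) expressed through the new type.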
Block.Address = Position.Address; @@ -257,7 +257,7 @@ TerminatorInfo SystemZLongBranch::describeTerminator(MachineInstr &MI) { } Terminator.Branch = &MI; Terminator.TargetBlock = - TII->getBranchInfo(MI).Target->getMBB()->getNumber(); + TII->getBranchInfo(MI).getMBBTarget()->getNumber(); } return Terminator; } @@ -275,7 +275,7 @@ uint64_t SystemZLongBranch::initMBBInfo() { Terminators.clear(); Terminators.reserve(NumBlocks); - BlockPosition Position(MF->getAlignment()); + BlockPosition Position(Log2(MF->getAlignment())); for (unsigned I = 0; I < NumBlocks; ++I) { MachineBasicBlock *MBB = MF->getBlockNumbered(I); MBBInfo &Block = MBBs[I]; @@ -339,7 +339,7 @@ bool SystemZLongBranch::mustRelaxABranch() { // must be long. void SystemZLongBranch::setWorstCaseAddresses() { SmallVector<TerminatorInfo, 16>::iterator TI = Terminators.begin(); - BlockPosition Position(MF->getAlignment()); + BlockPosition Position(Log2(MF->getAlignment())); for (auto &Block : MBBs) { skipNonTerminators(Position, Block); for (unsigned BTI = 0, BTE = Block.NumTerminators; BTI != BTE; ++BTI) { @@ -440,7 +440,7 @@ void SystemZLongBranch::relaxBranch(TerminatorInfo &Terminator) { // Run a shortening pass and relax any branches that need to be relaxed. void SystemZLongBranch::relaxBranches() { SmallVector<TerminatorInfo, 16>::iterator TI = Terminators.begin(); - BlockPosition Position(MF->getAlignment()); + BlockPosition Position(Log2(MF->getAlignment())); for (auto &Block : MBBs) { skipNonTerminators(Position, Block); for (unsigned BTI = 0, BTE = Block.NumTerminators; BTI != BTE; ++BTI) { diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/lib/Target/SystemZ/SystemZMachineScheduler.cpp index 0becfaa1d49c..3fc25034dded 100644 --- a/lib/Target/SystemZ/SystemZMachineScheduler.cpp +++ b/lib/Target/SystemZ/SystemZMachineScheduler.cpp @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "SystemZMachineScheduler.h" +#include "llvm/CodeGen/MachineLoopInfo.h" using namespace llvm; @@ -108,8 +109,8 @@ void SystemZPostRASchedStrategy::enterMBB(MachineBasicBlock *NextMBB) { I != SinglePredMBB->end(); I++) { LLVM_DEBUG(dbgs() << "** Emitting incoming branch: "; I->dump();); bool TakenBranch = (I->isBranch() && - (TII->getBranchInfo(*I).Target->isReg() || // Relative branch - TII->getBranchInfo(*I).Target->getMBB() == MBB)); + (TII->getBranchInfo(*I).isIndirect() || + TII->getBranchInfo(*I).getMBBTarget() == MBB)); HazardRec->emitInstruction(&*I, TakenBranch); if (TakenBranch) break; diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td index 56632e1529a2..b2bab68a6274 100644 --- a/lib/Target/SystemZ/SystemZOperands.td +++ b/lib/Target/SystemZ/SystemZOperands.td @@ -21,15 +21,32 @@ class ImmediateTLSAsmOperand<string name> let RenderMethod = "addImmTLSOperands"; } +class ImmediateOp<ValueType vt, string asmop> : Operand<vt> { + let PrintMethod = "print"##asmop##"Operand"; + let DecoderMethod = "decode"##asmop##"Operand"; + let ParserMatchClass = !cast<AsmOperandClass>(asmop); +} + +class ImmOpWithPattern<ValueType vt, string asmop, code pred, SDNodeXForm xform, + SDNode ImmNode = imm> : + ImmediateOp<vt, asmop>, PatLeaf<(vt ImmNode), pred, xform>; + +// class ImmediatePatLeaf<ValueType vt, code pred, +// SDNodeXForm xform, SDNode ImmNode> +// : PatLeaf<(vt ImmNode), pred, xform>; + + // Constructs both a DAG pattern and instruction operand for an immediate // of type VT. 
PRED returns true if a node is acceptable and XFORM returns // the operand value associated with the node. ASMOP is the name of the // associated asm operand, and also forms the basis of the asm print method. -class Immediate<ValueType vt, code pred, SDNodeXForm xform, string asmop> - : PatLeaf<(vt imm), pred, xform>, Operand<vt> { - let PrintMethod = "print"##asmop##"Operand"; - let DecoderMethod = "decode"##asmop##"Operand"; - let ParserMatchClass = !cast<AsmOperandClass>(asmop); +multiclass Immediate<ValueType vt, code pred, SDNodeXForm xform, string asmop> { + // def "" : ImmediateOp<vt, asmop>, + // PatLeaf<(vt imm), pred, xform>; + def "" : ImmOpWithPattern<vt, asmop, pred, xform>; + +// def _timm : PatLeaf<(vt timm), pred, xform>; + def _timm : ImmOpWithPattern<vt, asmop, pred, xform, timm>; } // Constructs an asm operand for a PC-relative address. SIZE says how @@ -295,87 +312,87 @@ def U48Imm : ImmediateAsmOperand<"U48Imm">; // Immediates for the lower and upper 16 bits of an i32, with the other // bits of the i32 being zero. -def imm32ll16 : Immediate<i32, [{ +defm imm32ll16 : Immediate<i32, [{ return SystemZ::isImmLL(N->getZExtValue()); }], LL16, "U16Imm">; -def imm32lh16 : Immediate<i32, [{ +defm imm32lh16 : Immediate<i32, [{ return SystemZ::isImmLH(N->getZExtValue()); }], LH16, "U16Imm">; // Immediates for the lower and upper 16 bits of an i32, with the other // bits of the i32 being one. -def imm32ll16c : Immediate<i32, [{ +defm imm32ll16c : Immediate<i32, [{ return SystemZ::isImmLL(uint32_t(~N->getZExtValue())); }], LL16, "U16Imm">; -def imm32lh16c : Immediate<i32, [{ +defm imm32lh16c : Immediate<i32, [{ return SystemZ::isImmLH(uint32_t(~N->getZExtValue())); }], LH16, "U16Imm">; // Short immediates -def imm32zx1 : Immediate<i32, [{ +defm imm32zx1 : Immediate<i32, [{ return isUInt<1>(N->getZExtValue()); }], NOOP_SDNodeXForm, "U1Imm">; -def imm32zx2 : Immediate<i32, [{ +defm imm32zx2 : Immediate<i32, [{ return isUInt<2>(N->getZExtValue()); }], NOOP_SDNodeXForm, "U2Imm">; -def imm32zx3 : Immediate<i32, [{ +defm imm32zx3 : Immediate<i32, [{ return isUInt<3>(N->getZExtValue()); }], NOOP_SDNodeXForm, "U3Imm">; -def imm32zx4 : Immediate<i32, [{ +defm imm32zx4 : Immediate<i32, [{ return isUInt<4>(N->getZExtValue()); }], NOOP_SDNodeXForm, "U4Imm">; // Note: this enforces an even value during code generation only. // When used from the assembler, any 4-bit value is allowed. 
-def imm32zx4even : Immediate<i32, [{ +defm imm32zx4even : Immediate<i32, [{ return isUInt<4>(N->getZExtValue()); }], UIMM8EVEN, "U4Imm">; -def imm32zx6 : Immediate<i32, [{ +defm imm32zx6 : Immediate<i32, [{ return isUInt<6>(N->getZExtValue()); }], NOOP_SDNodeXForm, "U6Imm">; -def imm32sx8 : Immediate<i32, [{ +defm imm32sx8 : Immediate<i32, [{ return isInt<8>(N->getSExtValue()); }], SIMM8, "S8Imm">; -def imm32zx8 : Immediate<i32, [{ +defm imm32zx8 : Immediate<i32, [{ return isUInt<8>(N->getZExtValue()); }], UIMM8, "U8Imm">; -def imm32zx8trunc : Immediate<i32, [{}], UIMM8, "U8Imm">; +defm imm32zx8trunc : Immediate<i32, [{}], UIMM8, "U8Imm">; -def imm32zx12 : Immediate<i32, [{ +defm imm32zx12 : Immediate<i32, [{ return isUInt<12>(N->getZExtValue()); }], UIMM12, "U12Imm">; -def imm32sx16 : Immediate<i32, [{ +defm imm32sx16 : Immediate<i32, [{ return isInt<16>(N->getSExtValue()); }], SIMM16, "S16Imm">; -def imm32sx16n : Immediate<i32, [{ +defm imm32sx16n : Immediate<i32, [{ return isInt<16>(-N->getSExtValue()); }], NEGSIMM16, "S16Imm">; -def imm32zx16 : Immediate<i32, [{ +defm imm32zx16 : Immediate<i32, [{ return isUInt<16>(N->getZExtValue()); }], UIMM16, "U16Imm">; -def imm32sx16trunc : Immediate<i32, [{}], SIMM16, "S16Imm">; -def imm32zx16trunc : Immediate<i32, [{}], UIMM16, "U16Imm">; +defm imm32sx16trunc : Immediate<i32, [{}], SIMM16, "S16Imm">; +defm imm32zx16trunc : Immediate<i32, [{}], UIMM16, "U16Imm">; // Full 32-bit immediates. we need both signed and unsigned versions // because the assembler is picky. E.g. AFI requires signed operands // while NILF requires unsigned ones. -def simm32 : Immediate<i32, [{}], SIMM32, "S32Imm">; -def uimm32 : Immediate<i32, [{}], UIMM32, "U32Imm">; +defm simm32 : Immediate<i32, [{}], SIMM32, "S32Imm">; +defm uimm32 : Immediate<i32, [{}], UIMM32, "U32Imm">; -def simm32n : Immediate<i32, [{ +defm simm32n : Immediate<i32, [{ return isInt<32>(-N->getSExtValue()); }], NEGSIMM32, "S32Imm">; @@ -387,107 +404,107 @@ def imm32 : ImmLeaf<i32, [{}]>; // Immediates for 16-bit chunks of an i64, with the other bits of the // i32 being zero. -def imm64ll16 : Immediate<i64, [{ +defm imm64ll16 : Immediate<i64, [{ return SystemZ::isImmLL(N->getZExtValue()); }], LL16, "U16Imm">; -def imm64lh16 : Immediate<i64, [{ +defm imm64lh16 : Immediate<i64, [{ return SystemZ::isImmLH(N->getZExtValue()); }], LH16, "U16Imm">; -def imm64hl16 : Immediate<i64, [{ +defm imm64hl16 : Immediate<i64, [{ return SystemZ::isImmHL(N->getZExtValue()); }], HL16, "U16Imm">; -def imm64hh16 : Immediate<i64, [{ +defm imm64hh16 : Immediate<i64, [{ return SystemZ::isImmHH(N->getZExtValue()); }], HH16, "U16Imm">; // Immediates for 16-bit chunks of an i64, with the other bits of the // i32 being one. -def imm64ll16c : Immediate<i64, [{ +defm imm64ll16c : Immediate<i64, [{ return SystemZ::isImmLL(uint64_t(~N->getZExtValue())); }], LL16, "U16Imm">; -def imm64lh16c : Immediate<i64, [{ +defm imm64lh16c : Immediate<i64, [{ return SystemZ::isImmLH(uint64_t(~N->getZExtValue())); }], LH16, "U16Imm">; -def imm64hl16c : Immediate<i64, [{ +defm imm64hl16c : Immediate<i64, [{ return SystemZ::isImmHL(uint64_t(~N->getZExtValue())); }], HL16, "U16Imm">; -def imm64hh16c : Immediate<i64, [{ +defm imm64hh16c : Immediate<i64, [{ return SystemZ::isImmHH(uint64_t(~N->getZExtValue())); }], HH16, "U16Imm">; // Immediates for the lower and upper 32 bits of an i64, with the other // bits of the i32 being zero. 
-def imm64lf32 : Immediate<i64, [{ +defm imm64lf32 : Immediate<i64, [{ return SystemZ::isImmLF(N->getZExtValue()); }], LF32, "U32Imm">; -def imm64hf32 : Immediate<i64, [{ +defm imm64hf32 : Immediate<i64, [{ return SystemZ::isImmHF(N->getZExtValue()); }], HF32, "U32Imm">; // Immediates for the lower and upper 32 bits of an i64, with the other // bits of the i32 being one. -def imm64lf32c : Immediate<i64, [{ +defm imm64lf32c : Immediate<i64, [{ return SystemZ::isImmLF(uint64_t(~N->getZExtValue())); }], LF32, "U32Imm">; -def imm64hf32c : Immediate<i64, [{ +defm imm64hf32c : Immediate<i64, [{ return SystemZ::isImmHF(uint64_t(~N->getZExtValue())); }], HF32, "U32Imm">; // Negated immediates that fit LF32 or LH16. -def imm64lh16n : Immediate<i64, [{ +defm imm64lh16n : Immediate<i64, [{ return SystemZ::isImmLH(uint64_t(-N->getZExtValue())); }], NEGLH16, "U16Imm">; -def imm64lf32n : Immediate<i64, [{ +defm imm64lf32n : Immediate<i64, [{ return SystemZ::isImmLF(uint64_t(-N->getZExtValue())); }], NEGLF32, "U32Imm">; // Short immediates. -def imm64sx8 : Immediate<i64, [{ +defm imm64sx8 : Immediate<i64, [{ return isInt<8>(N->getSExtValue()); }], SIMM8, "S8Imm">; -def imm64zx8 : Immediate<i64, [{ +defm imm64zx8 : Immediate<i64, [{ return isUInt<8>(N->getSExtValue()); }], UIMM8, "U8Imm">; -def imm64sx16 : Immediate<i64, [{ +defm imm64sx16 : Immediate<i64, [{ return isInt<16>(N->getSExtValue()); }], SIMM16, "S16Imm">; -def imm64sx16n : Immediate<i64, [{ +defm imm64sx16n : Immediate<i64, [{ return isInt<16>(-N->getSExtValue()); }], NEGSIMM16, "S16Imm">; -def imm64zx16 : Immediate<i64, [{ +defm imm64zx16 : Immediate<i64, [{ return isUInt<16>(N->getZExtValue()); }], UIMM16, "U16Imm">; -def imm64sx32 : Immediate<i64, [{ +defm imm64sx32 : Immediate<i64, [{ return isInt<32>(N->getSExtValue()); }], SIMM32, "S32Imm">; -def imm64sx32n : Immediate<i64, [{ +defm imm64sx32n : Immediate<i64, [{ return isInt<32>(-N->getSExtValue()); }], NEGSIMM32, "S32Imm">; -def imm64zx32 : Immediate<i64, [{ +defm imm64zx32 : Immediate<i64, [{ return isUInt<32>(N->getZExtValue()); }], UIMM32, "U32Imm">; -def imm64zx32n : Immediate<i64, [{ +defm imm64zx32n : Immediate<i64, [{ return isUInt<32>(-N->getSExtValue()); }], NEGUIMM32, "U32Imm">; -def imm64zx48 : Immediate<i64, [{ +defm imm64zx48 : Immediate<i64, [{ return isUInt<64>(N->getZExtValue()); }], UIMM48, "U48Imm">; @@ -637,7 +654,7 @@ def bdvaddr12only : BDVMode< "64", "12">; //===----------------------------------------------------------------------===// // A 4-bit condition-code mask. -def cond4 : PatLeaf<(i32 imm), [{ return (N->getZExtValue() < 16); }]>, +def cond4 : PatLeaf<(i32 timm), [{ return (N->getZExtValue() < 16); }]>, Operand<i32> { let PrintMethod = "printCond4Operand"; } diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index 15bd12bc98a4..6fe383e64b74 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -472,17 +472,17 @@ def z_subcarry : PatFrag<(ops node:$lhs, node:$rhs), (z_subcarry_1 node:$lhs, node:$rhs, CC)>; // Signed and unsigned comparisons. 
-def z_scmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{ +def z_scmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, timm), [{ unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); return Type != SystemZICMP::UnsignedOnly; }]>; -def z_ucmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{ +def z_ucmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, timm), [{ unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); return Type != SystemZICMP::SignedOnly; }]>; // Register- and memory-based TEST UNDER MASK. -def z_tm_reg : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, imm)>; +def z_tm_reg : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, timm)>; def z_tm_mem : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, 0)>; // Register sign-extend operations. Sub-32-bit values are represented as i32s. diff --git a/lib/Target/SystemZ/SystemZPatterns.td b/lib/Target/SystemZ/SystemZPatterns.td index beaf4de285a3..65300fb47627 100644 --- a/lib/Target/SystemZ/SystemZPatterns.td +++ b/lib/Target/SystemZ/SystemZPatterns.td @@ -100,12 +100,12 @@ multiclass CondStores64<Instruction insn, Instruction insninv, SDPatternOperator store, SDPatternOperator load, AddressingMode mode> { def : Pat<(store (z_select_ccmask GR64:$new, (load mode:$addr), - imm32zx4:$valid, imm32zx4:$cc), + imm32zx4_timm:$valid, imm32zx4_timm:$cc), mode:$addr), (insn (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr, imm32zx4:$valid, imm32zx4:$cc)>; def : Pat<(store (z_select_ccmask (load mode:$addr), GR64:$new, - imm32zx4:$valid, imm32zx4:$cc), + imm32zx4_timm:$valid, imm32zx4_timm:$cc), mode:$addr), (insninv (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr, imm32zx4:$valid, imm32zx4:$cc)>; diff --git a/lib/Target/SystemZ/SystemZPostRewrite.cpp b/lib/Target/SystemZ/SystemZPostRewrite.cpp index 8e4060eac74c..aaa7f8fc88f5 100644 --- a/lib/Target/SystemZ/SystemZPostRewrite.cpp +++ b/lib/Target/SystemZ/SystemZPostRewrite.cpp @@ -25,6 +25,7 @@ using namespace llvm; #define DEBUG_TYPE "systemz-postrewrite" STATISTIC(MemFoldCopies, "Number of copies inserted before folded mem ops."); +STATISTIC(LOCRMuxJumps, "Number of LOCRMux jump-sequences (lower is better)"); namespace llvm { void initializeSystemZPostRewritePass(PassRegistry&); @@ -45,12 +46,20 @@ public: StringRef getPassName() const override { return SYSTEMZ_POSTREWRITE_NAME; } - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - } - private: + void selectLOCRMux(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, + unsigned LowOpcode, + unsigned HighOpcode); + void selectSELRMux(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, + unsigned LowOpcode, + unsigned HighOpcode); + bool expandCondMove(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); bool selectMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); bool selectMBB(MachineBasicBlock &MBB); @@ -68,11 +77,141 @@ FunctionPass *llvm::createSystemZPostRewritePass(SystemZTargetMachine &TM) { return new SystemZPostRewrite(); } +// MI is a load-register-on-condition pseudo instruction. Replace it with +// LowOpcode if source and destination are both low GR32s and HighOpcode if +// source and destination are both high GR32s. 
Otherwise, a branch sequence +// is created. +void SystemZPostRewrite::selectLOCRMux(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, + unsigned LowOpcode, + unsigned HighOpcode) { + Register DestReg = MBBI->getOperand(0).getReg(); + Register SrcReg = MBBI->getOperand(2).getReg(); + bool DestIsHigh = SystemZ::isHighReg(DestReg); + bool SrcIsHigh = SystemZ::isHighReg(SrcReg); + + if (!DestIsHigh && !SrcIsHigh) + MBBI->setDesc(TII->get(LowOpcode)); + else if (DestIsHigh && SrcIsHigh) + MBBI->setDesc(TII->get(HighOpcode)); + else + expandCondMove(MBB, MBBI, NextMBBI); +} + +// MI is a select pseudo instruction. Replace it with LowOpcode if source +// and destination are all low GR32s and HighOpcode if source and destination +// are all high GR32s. Otherwise, a branch sequence is created. +void SystemZPostRewrite::selectSELRMux(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, + unsigned LowOpcode, + unsigned HighOpcode) { + Register DestReg = MBBI->getOperand(0).getReg(); + Register Src1Reg = MBBI->getOperand(1).getReg(); + Register Src2Reg = MBBI->getOperand(2).getReg(); + bool DestIsHigh = SystemZ::isHighReg(DestReg); + bool Src1IsHigh = SystemZ::isHighReg(Src1Reg); + bool Src2IsHigh = SystemZ::isHighReg(Src2Reg); + + // If sources and destination aren't all high or all low, we may be able to + // simplify the operation by moving one of the sources to the destination + // first. But only if this doesn't clobber the other source. + if (DestReg != Src1Reg && DestReg != Src2Reg) { + if (DestIsHigh != Src1IsHigh) { + BuildMI(*MBBI->getParent(), MBBI, MBBI->getDebugLoc(), + TII->get(SystemZ::COPY), DestReg) + .addReg(MBBI->getOperand(1).getReg(), getRegState(MBBI->getOperand(1))); + MBBI->getOperand(1).setReg(DestReg); + Src1Reg = DestReg; + Src1IsHigh = DestIsHigh; + } else if (DestIsHigh != Src2IsHigh) { + BuildMI(*MBBI->getParent(), MBBI, MBBI->getDebugLoc(), + TII->get(SystemZ::COPY), DestReg) + .addReg(MBBI->getOperand(2).getReg(), getRegState(MBBI->getOperand(2))); + MBBI->getOperand(2).setReg(DestReg); + Src2Reg = DestReg; + Src2IsHigh = DestIsHigh; + } + } + + // If the destination (now) matches one source, prefer this to be first. + if (DestReg != Src1Reg && DestReg == Src2Reg) { + TII->commuteInstruction(*MBBI, false, 1, 2); + std::swap(Src1Reg, Src2Reg); + std::swap(Src1IsHigh, Src2IsHigh); + } + + if (!DestIsHigh && !Src1IsHigh && !Src2IsHigh) + MBBI->setDesc(TII->get(LowOpcode)); + else if (DestIsHigh && Src1IsHigh && Src2IsHigh) + MBBI->setDesc(TII->get(HighOpcode)); + else + // Given the simplification above, we must already have a two-operand case. + expandCondMove(MBB, MBBI, NextMBBI); +} + +// Replace MBBI by a branch sequence that performs a conditional move of +// operand 2 to the destination register. Operand 1 is expected to be the +// same register as the destination. 
+bool SystemZPostRewrite::expandCondMove(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + MachineFunction &MF = *MBB.getParent(); + const BasicBlock *BB = MBB.getBasicBlock(); + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + Register DestReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); + unsigned CCValid = MI.getOperand(3).getImm(); + unsigned CCMask = MI.getOperand(4).getImm(); + assert(DestReg == MI.getOperand(1).getReg() && + "Expected destination and first source operand to be the same."); + + LivePhysRegs LiveRegs(TII->getRegisterInfo()); + LiveRegs.addLiveOuts(MBB); + for (auto I = std::prev(MBB.end()); I != MBBI; --I) + LiveRegs.stepBackward(*I); + + // Splice MBB at MI, moving the rest of the block into RestMBB. + MachineBasicBlock *RestMBB = MF.CreateMachineBasicBlock(BB); + MF.insert(std::next(MachineFunction::iterator(MBB)), RestMBB); + RestMBB->splice(RestMBB->begin(), &MBB, MI, MBB.end()); + RestMBB->transferSuccessors(&MBB); + for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) + RestMBB->addLiveIn(*I); + + // Create a new block MoveMBB to hold the move instruction. + MachineBasicBlock *MoveMBB = MF.CreateMachineBasicBlock(BB); + MF.insert(std::next(MachineFunction::iterator(MBB)), MoveMBB); + MoveMBB->addLiveIn(SrcReg); + for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) + MoveMBB->addLiveIn(*I); + + // At the end of MBB, create a conditional branch to RestMBB if the + // condition is false, otherwise fall through to MoveMBB. + BuildMI(&MBB, DL, TII->get(SystemZ::BRC)) + .addImm(CCValid).addImm(CCMask ^ CCValid).addMBB(RestMBB); + MBB.addSuccessor(RestMBB); + MBB.addSuccessor(MoveMBB); + + // In MoveMBB, emit an instruction to move SrcReg into DestReg, + // then fall through to RestMBB. + BuildMI(*MoveMBB, MoveMBB->end(), DL, TII->get(SystemZ::COPY), DestReg) + .addReg(MI.getOperand(2).getReg(), getRegState(MI.getOperand(2))); + MoveMBB->addSuccessor(RestMBB); + + NextMBBI = MBB.end(); + MI.eraseFromParent(); + LOCRMuxJumps++; + return true; +} + /// If MBBI references a pseudo instruction that should be selected here, /// do it and return true. Otherwise return false. 
bool SystemZPostRewrite::selectMI(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI) { + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); @@ -83,7 +222,7 @@ bool SystemZPostRewrite::selectMI(MachineBasicBlock &MBB, if (TargetMemOpcode != -1) { MI.setDesc(TII->get(TargetMemOpcode)); MI.tieOperands(0, 1); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); MachineOperand &SrcMO = MI.getOperand(1); if (DstReg != SrcMO.getReg()) { BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(SystemZ::COPY), DstReg) @@ -94,6 +233,15 @@ bool SystemZPostRewrite::selectMI(MachineBasicBlock &MBB, return true; } + switch (Opcode) { + case SystemZ::LOCRMux: + selectLOCRMux(MBB, MBBI, NextMBBI, SystemZ::LOCR, SystemZ::LOCFHR); + return true; + case SystemZ::SELRMux: + selectSELRMux(MBB, MBBI, NextMBBI, SystemZ::SELR, SystemZ::SELFHR); + return true; + } + return false; } diff --git a/lib/Target/SystemZ/SystemZProcessors.td b/lib/Target/SystemZ/SystemZProcessors.td index b27c25beb58c..af33a0300552 100644 --- a/lib/Target/SystemZ/SystemZProcessors.td +++ b/lib/Target/SystemZ/SystemZProcessors.td @@ -35,5 +35,6 @@ def : ProcessorModel<"z13", Z13Model, Arch11SupportedFeatures.List>; def : ProcessorModel<"arch12", Z14Model, Arch12SupportedFeatures.List>; def : ProcessorModel<"z14", Z14Model, Arch12SupportedFeatures.List>; -def : ProcessorModel<"arch13", Arch13Model, Arch13SupportedFeatures.List>; +def : ProcessorModel<"arch13", Z15Model, Arch13SupportedFeatures.List>; +def : ProcessorModel<"z15", Z15Model, Arch13SupportedFeatures.List>; diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp index e7cd6871dbb4..39ace5594b7f 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp +++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp @@ -41,7 +41,7 @@ static const TargetRegisterClass *getRC32(MachineOperand &MO, return &SystemZ::GRH32BitRegClass; if (VRM && VRM->hasPhys(MO.getReg())) { - unsigned PhysReg = VRM->getPhys(MO.getReg()); + Register PhysReg = VRM->getPhys(MO.getReg()); if (SystemZ::GR32BitRegClass.contains(PhysReg)) return &SystemZ::GR32BitRegClass; assert (SystemZ::GRH32BitRegClass.contains(PhysReg) && @@ -120,8 +120,8 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg, } // Add the other operand of the LOCRMux to the worklist. - unsigned OtherReg = - (TrueMO.getReg() == Reg ? FalseMO.getReg() : TrueMO.getReg()); + Register OtherReg = + (TrueMO.getReg() == Reg ? FalseMO.getReg() : TrueMO.getReg()); if (MRI->getRegClass(OtherReg) == &SystemZ::GRX32BitRegClass) Worklist.push_back(OtherReg); } // end LOCRMux @@ -169,7 +169,8 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg, auto tryAddHint = [&](const MachineOperand *MO) -> void { Register Reg = MO->getReg(); - Register PhysReg = isPhysicalRegister(Reg) ? Reg : VRM->getPhys(Reg); + Register PhysReg = + Register::isPhysicalRegister(Reg) ? 
Reg : VRM->getPhys(Reg); if (PhysReg) { if (MO->getSubReg()) PhysReg = getSubReg(PhysReg, MO->getSubReg()); @@ -297,8 +298,8 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, assert(Mask && "One offset must be OK"); } while (!OpcodeForOffset); - unsigned ScratchReg = - MF.getRegInfo().createVirtualRegister(&SystemZ::ADDR64BitRegClass); + Register ScratchReg = + MF.getRegInfo().createVirtualRegister(&SystemZ::ADDR64BitRegClass); int64_t HighOffset = OldOffset - Offset; if (MI->getDesc().TSFlags & SystemZII::HasIndex @@ -351,8 +352,8 @@ bool SystemZRegisterInfo::shouldCoalesce(MachineInstr *MI, // regalloc may run out of registers. unsigned WideOpNo = (getRegSizeInBits(*SrcRC) == 128 ? 1 : 0); - unsigned GR128Reg = MI->getOperand(WideOpNo).getReg(); - unsigned GRNarReg = MI->getOperand((WideOpNo == 1) ? 0 : 1).getReg(); + Register GR128Reg = MI->getOperand(WideOpNo).getReg(); + Register GRNarReg = MI->getOperand((WideOpNo == 1) ? 0 : 1).getReg(); LiveInterval &IntGR128 = LIS.getInterval(GR128Reg); LiveInterval &IntGRNar = LIS.getInterval(GRNarReg); @@ -385,7 +386,7 @@ bool SystemZRegisterInfo::shouldCoalesce(MachineInstr *MI, MEE++; for (; MII != MEE; ++MII) { for (const MachineOperand &MO : MII->operands()) - if (MO.isReg() && isPhysicalRegister(MO.getReg())) { + if (MO.isReg() && Register::isPhysicalRegister(MO.getReg())) { for (MCSuperRegIterator SI(MO.getReg(), this, true/*IncludeSelf*/); SI.isValid(); ++SI) if (NewRC->contains(*SI)) { diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h index 4f721ec23e53..7044efef1ac6 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.h +++ b/lib/Target/SystemZ/SystemZRegisterInfo.h @@ -28,6 +28,15 @@ inline unsigned even128(bool Is32bit) { inline unsigned odd128(bool Is32bit) { return Is32bit ? subreg_l32 : subreg_l64; } + +// Reg should be a 32-bit GPR. Return true if it is a high register rather +// than a low register. +inline bool isHighReg(unsigned int Reg) { + if (SystemZ::GRH32BitRegClass.contains(Reg)) + return true; + assert(SystemZ::GR32BitRegClass.contains(Reg) && "Invalid GRX32"); + return false; +} } // end namespace SystemZ struct SystemZRegisterInfo : public SystemZGenRegisterInfo { diff --git a/lib/Target/SystemZ/SystemZSchedule.td b/lib/Target/SystemZ/SystemZSchedule.td index 98eca2802242..119e3ee7c22c 100644 --- a/lib/Target/SystemZ/SystemZSchedule.td +++ b/lib/Target/SystemZ/SystemZSchedule.td @@ -59,7 +59,7 @@ def VBU : SchedWrite; // Virtual branching unit def MCD : SchedWrite; // Millicode -include "SystemZScheduleArch13.td" +include "SystemZScheduleZ15.td" include "SystemZScheduleZ14.td" include "SystemZScheduleZ13.td" include "SystemZScheduleZEC12.td" diff --git a/lib/Target/SystemZ/SystemZScheduleArch13.td b/lib/Target/SystemZ/SystemZScheduleZ15.td index 9f82f24d0e8f..56ceb88f35d4 100644 --- a/lib/Target/SystemZ/SystemZScheduleArch13.td +++ b/lib/Target/SystemZ/SystemZScheduleZ15.td @@ -1,4 +1,4 @@ -//-- SystemZScheduleArch13.td - SystemZ Scheduling Definitions ----*- tblgen -*-=// +//-- SystemZScheduleZ15.td - SystemZ Scheduling Definitions ----*- tblgen -*-=// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
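Scattered through the SystemZ hunks above and below is a mechanical migration from unsigned register numbers to llvm::Register, with the isVirtualRegister/isPhysicalRegister checks becoming static predicates on that class. A simplified model of what those predicates separate; the high-bit encoding here mimics LLVM's idea but is not its exact layout:

#include <cassert>
#include <cstdint>

class Register {
  uint32_t Id;
public:
  // Virtual registers are tagged with a reserved high bit; 0 means
  // "no register". This mirrors the intent of llvm::Register only.
  static const uint32_t VirtualFlag = 1u << 31;
  explicit Register(uint32_t Id) : Id(Id) {}
  static bool isVirtualRegister(Register R) { return R.Id & VirtualFlag; }
  static bool isPhysicalRegister(Register R) {
    return R.Id != 0 && !(R.Id & VirtualFlag);
  }
};

int main() {
  Register NoReg(0), Phys(42), Virt(Register::VirtualFlag | 7);
  assert(Register::isPhysicalRegister(Phys));
  assert(Register::isVirtualRegister(Virt));
  assert(!Register::isPhysicalRegister(NoReg) &&
         !Register::isVirtualRegister(NoReg));
}

This is why spots like getRegAllocationHints above can ask Register::isPhysicalRegister(Reg) without a TargetRegisterInfo in hand.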
@@ -6,14 +6,14 @@ // //===----------------------------------------------------------------------===// // -// This file defines the machine model for Arch13 to support instruction +// This file defines the machine model for Z15 to support instruction // scheduling and other instruction cost heuristics. // // Pseudos expanded right after isel do not need to be modelled here. // //===----------------------------------------------------------------------===// -def Arch13Model : SchedMachineModel { +def Z15Model : SchedMachineModel { let UnsupportedFeatures = Arch13UnsupportedFeatures.List; @@ -27,7 +27,7 @@ def Arch13Model : SchedMachineModel { let MispredictPenalty = 20; } -let SchedModel = Arch13Model in { +let SchedModel = Z15Model in { // These definitions need the SchedModel value. They could be put in a // subtarget common include file, but it seems the include system in Tablegen // currently (2016) rejects multiple includes of same file. @@ -73,43 +73,43 @@ let NumMicroOps = 0 in { } // Execution units. -def Arch13_FXaUnit : ProcResource<2>; -def Arch13_FXbUnit : ProcResource<2>; -def Arch13_LSUnit : ProcResource<2>; -def Arch13_VecUnit : ProcResource<2>; -def Arch13_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ } -def Arch13_VBUnit : ProcResource<2>; -def Arch13_MCD : ProcResource<1>; +def Z15_FXaUnit : ProcResource<2>; +def Z15_FXbUnit : ProcResource<2>; +def Z15_LSUnit : ProcResource<2>; +def Z15_VecUnit : ProcResource<2>; +def Z15_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ } +def Z15_VBUnit : ProcResource<2>; +def Z15_MCD : ProcResource<1>; // Subtarget specific definitions of scheduling resources. let NumMicroOps = 0 in { - def : WriteRes<FXa, [Arch13_FXaUnit]>; - def : WriteRes<FXb, [Arch13_FXbUnit]>; - def : WriteRes<LSU, [Arch13_LSUnit]>; - def : WriteRes<VecBF, [Arch13_VecUnit]>; - def : WriteRes<VecDF, [Arch13_VecUnit]>; - def : WriteRes<VecDFX, [Arch13_VecUnit]>; - def : WriteRes<VecMul, [Arch13_VecUnit]>; - def : WriteRes<VecStr, [Arch13_VecUnit]>; - def : WriteRes<VecXsPm, [Arch13_VecUnit]>; + def : WriteRes<FXa, [Z15_FXaUnit]>; + def : WriteRes<FXb, [Z15_FXbUnit]>; + def : WriteRes<LSU, [Z15_LSUnit]>; + def : WriteRes<VecBF, [Z15_VecUnit]>; + def : WriteRes<VecDF, [Z15_VecUnit]>; + def : WriteRes<VecDFX, [Z15_VecUnit]>; + def : WriteRes<VecMul, [Z15_VecUnit]>; + def : WriteRes<VecStr, [Z15_VecUnit]>; + def : WriteRes<VecXsPm, [Z15_VecUnit]>; foreach Num = 2-5 in { let ResourceCycles = [Num] in { - def : WriteRes<!cast<SchedWrite>("FXa"#Num), [Arch13_FXaUnit]>; - def : WriteRes<!cast<SchedWrite>("FXb"#Num), [Arch13_FXbUnit]>; - def : WriteRes<!cast<SchedWrite>("LSU"#Num), [Arch13_LSUnit]>; - def : WriteRes<!cast<SchedWrite>("VecBF"#Num), [Arch13_VecUnit]>; - def : WriteRes<!cast<SchedWrite>("VecDF"#Num), [Arch13_VecUnit]>; - def : WriteRes<!cast<SchedWrite>("VecDFX"#Num), [Arch13_VecUnit]>; - def : WriteRes<!cast<SchedWrite>("VecMul"#Num), [Arch13_VecUnit]>; - def : WriteRes<!cast<SchedWrite>("VecStr"#Num), [Arch13_VecUnit]>; - def : WriteRes<!cast<SchedWrite>("VecXsPm"#Num), [Arch13_VecUnit]>; + def : WriteRes<!cast<SchedWrite>("FXa"#Num), [Z15_FXaUnit]>; + def : WriteRes<!cast<SchedWrite>("FXb"#Num), [Z15_FXbUnit]>; + def : WriteRes<!cast<SchedWrite>("LSU"#Num), [Z15_LSUnit]>; + def : WriteRes<!cast<SchedWrite>("VecBF"#Num), [Z15_VecUnit]>; + def : WriteRes<!cast<SchedWrite>("VecDF"#Num), [Z15_VecUnit]>; + def : WriteRes<!cast<SchedWrite>("VecDFX"#Num), [Z15_VecUnit]>; + def : WriteRes<!cast<SchedWrite>("VecMul"#Num), 
[Z15_VecUnit]>; + def : WriteRes<!cast<SchedWrite>("VecStr"#Num), [Z15_VecUnit]>; + def : WriteRes<!cast<SchedWrite>("VecXsPm"#Num), [Z15_VecUnit]>; }} - def : WriteRes<VecFPd, [Arch13_VecFPdUnit]> { let ResourceCycles = [30]; } + def : WriteRes<VecFPd, [Z15_VecFPdUnit]> { let ResourceCycles = [30]; } - def : WriteRes<VBU, [Arch13_VBUnit]>; // Virtual Branching Unit + def : WriteRes<VBU, [Z15_VBUnit]>; // Virtual Branching Unit } -def : WriteRes<MCD, [Arch13_MCD]> { let NumMicroOps = 3; +def : WriteRes<MCD, [Z15_MCD]> { let NumMicroOps = 3; let BeginGroup = 1; let EndGroup = 1; } diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index a50e6aa59711..47c925dcf730 100644 --- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -209,10 +209,10 @@ std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemchr( // Now select between End and null, depending on whether the character // was found. - SDValue Ops[] = {End, DAG.getConstant(0, DL, PtrVT), - DAG.getConstant(SystemZ::CCMASK_SRST, DL, MVT::i32), - DAG.getConstant(SystemZ::CCMASK_SRST_FOUND, DL, MVT::i32), - CCReg}; + SDValue Ops[] = { + End, DAG.getConstant(0, DL, PtrVT), + DAG.getTargetConstant(SystemZ::CCMASK_SRST, DL, MVT::i32), + DAG.getTargetConstant(SystemZ::CCMASK_SRST_FOUND, DL, MVT::i32), CCReg}; End = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, PtrVT, Ops); return std::make_pair(End, Chain); } diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp index e79dfc5b4b9e..2aca22c9082a 100644 --- a/lib/Target/SystemZ/SystemZShortenInst.cpp +++ b/lib/Target/SystemZ/SystemZShortenInst.cpp @@ -75,7 +75,7 @@ static void tieOpsIfNeeded(MachineInstr &MI) { // instead of IIxF. bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned LLIxL, unsigned LLIxH) { - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); // The new opcode will clear the other half of the GR64 reg, so // cancel if that is live. 
unsigned thisSubRegIdx = @@ -86,7 +86,7 @@ bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned LLIxL, : SystemZ::subreg_l32); unsigned GR64BitReg = TRI->getMatchingSuperReg(Reg, thisSubRegIdx, &SystemZ::GR64BitRegClass); - unsigned OtherReg = TRI->getSubReg(GR64BitReg, otherSubRegIdx); + Register OtherReg = TRI->getSubReg(GR64BitReg, otherSubRegIdx); if (LiveRegs.contains(OtherReg)) return false; diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index 5c49e6eff0bf..20865037fe38 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -154,7 +154,7 @@ SystemZTargetMachine::SystemZTargetMachine(const Target &T, const Triple &TT, getEffectiveRelocModel(RM), getEffectiveSystemZCodeModel(CM, getEffectiveRelocModel(RM), JIT), OL), - TLOF(llvm::make_unique<TargetLoweringObjectFileELF>()), + TLOF(std::make_unique<TargetLoweringObjectFileELF>()), Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } @@ -176,7 +176,7 @@ public: ScheduleDAGInstrs * createPostMachineScheduler(MachineSchedContext *C) const override { return new ScheduleDAGMI(C, - llvm::make_unique<SystemZPostRASchedStrategy>(C), + std::make_unique<SystemZPostRASchedStrategy>(C), /*RemoveKillFlags=*/true); } @@ -184,6 +184,7 @@ public: bool addInstSelector() override; bool addILPOpts() override; void addPostRewrite() override; + void addPostRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; }; @@ -217,14 +218,14 @@ void SystemZPassConfig::addPostRewrite() { addPass(createSystemZPostRewritePass(getSystemZTargetMachine())); } -void SystemZPassConfig::addPreSched2() { +void SystemZPassConfig::addPostRegAlloc() { // PostRewrite needs to be run at -O0 also (in which case addPostRewrite() // is not called). if (getOptLevel() == CodeGenOpt::None) addPass(createSystemZPostRewritePass(getSystemZTargetMachine())); +} - addPass(createSystemZExpandPseudoPass(getSystemZTargetMachine())); - +void SystemZPassConfig::addPreSched2() { if (getOptLevel() != CodeGenOpt::None) addPass(&IfConverterID); } diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 145cf87ef9f5..11c99aa11174 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -304,7 +304,8 @@ bool SystemZTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1, C2.ScaleCost, C2.SetupCost); } -unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) { +unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const { + bool Vector = (ClassID == 1); if (!Vector) // Discount the stack pointer. Also leave out %r0, since it can't // be used in an address. @@ -707,7 +708,7 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, // TODO: Fix base implementation which could simplify things a bit here // (seems to miss on differentiating on scalar/vector types). - // Only 64 bit vector conversions are natively supported before arch13. + // Only 64 bit vector conversions are natively supported before z15. 
if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) { if (SrcScalarBits == DstScalarBits) return NumDstVectors; diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index 16ce2ef1d7a0..3ba80b31439f 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -56,12 +56,12 @@ public: /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(bool Vector); + unsigned getNumberOfRegisters(unsigned ClassID) const; unsigned getRegisterBitWidth(bool Vector) const; - unsigned getCacheLineSize() { return 256; } - unsigned getPrefetchDistance() { return 2000; } - unsigned getMinPrefetchStride() { return 2048; } + unsigned getCacheLineSize() const override { return 256; } + unsigned getPrefetchDistance() const override { return 2000; } + unsigned getMinPrefetchStride() const override { return 2048; } bool hasDivRemOp(Type *DataType, bool IsSigned); bool prefersVectorizedAddressing() { return false; } diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp index 17274e1c2c6e..dcd3934de0fa 100644 --- a/lib/Target/TargetLoweringObjectFile.cpp +++ b/lib/Target/TargetLoweringObjectFile.cpp @@ -253,6 +253,7 @@ MCSection *TargetLoweringObjectFile::SectionForGlobal( auto Attrs = GVar->getAttributes(); if ((Attrs.hasAttribute("bss-section") && Kind.isBSS()) || (Attrs.hasAttribute("data-section") && Kind.isData()) || + (Attrs.hasAttribute("relro-section") && Kind.isReadOnlyWithRel()) || (Attrs.hasAttribute("rodata-section") && Kind.isReadOnly())) { return getExplicitSectionGlobal(GO, Kind, TM); } diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index 634866d93570..4c98e140f446 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -63,18 +63,6 @@ void TargetMachine::resetTargetOptions(const Function &F) const { RESET_OPTION(NoInfsFPMath, "no-infs-fp-math"); RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math"); RESET_OPTION(NoSignedZerosFPMath, "no-signed-zeros-fp-math"); - RESET_OPTION(NoTrappingFPMath, "no-trapping-math"); - - StringRef Denormal = - F.getFnAttribute("denormal-fp-math").getValueAsString(); - if (Denormal == "ieee") - Options.FPDenormalMode = FPDenormal::IEEE; - else if (Denormal == "preserve-sign") - Options.FPDenormalMode = FPDenormal::PreserveSign; - else if (Denormal == "positive-zero") - Options.FPDenormalMode = FPDenormal::PositiveZero; - else - Options.FPDenormalMode = DefaultOptions.FPDenormalMode; } /// Returns the code generation relocation model. The choices are static, PIC, @@ -140,8 +128,8 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M, // don't assume the variables to be DSO local unless we actually know // that for sure. This only has to be done for variables; for functions // the linker can insert thunks for calling functions from another DLL. - if (TT.isWindowsGNUEnvironment() && GV && GV->isDeclarationForLinker() && - isa<GlobalVariable>(GV)) + if (TT.isWindowsGNUEnvironment() && TT.isOSBinFormatCOFF() && GV && + GV->isDeclarationForLinker() && isa<GlobalVariable>(GV)) return false; // On COFF, don't mark 'extern_weak' symbols as DSO local. If these symbols @@ -154,7 +142,9 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M, // Make an exception for windows OS in the triple: Some firmware builds use // *-win32-macho triples. This (accidentally?) produced windows relocations // without GOT tables in older clang versions; Keep this behaviour. 
- if (TT.isOSBinFormatCOFF() || (TT.isOSWindows() && TT.isOSBinFormatMachO())) + // Some JIT users use *-win32-elf triples; these shouldn't use GOT tables + // either. + if (TT.isOSBinFormatCOFF() || TT.isOSWindows()) return true; // Most PIC code sequences that assume that a symbol is local cannot diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp index 5d9029682fdd..3ac9c38dfc0b 100644 --- a/lib/Target/TargetMachineC.cpp +++ b/lib/Target/TargetMachineC.cpp @@ -219,7 +219,7 @@ static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M, LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M, char* Filename, LLVMCodeGenFileType codegen, char** ErrorMessage) { std::error_code EC; - raw_fd_ostream dest(Filename, EC, sys::fs::F_None); + raw_fd_ostream dest(Filename, EC, sys::fs::OF_None); if (EC) { *ErrorMessage = strdup(EC.message().c_str()); return true; diff --git a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 09628e872dd5..53a96fd6a97d 100644 --- a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -313,16 +313,17 @@ public: return Optional<wasm::ValType>(); } - WebAssembly::ExprType parseBlockType(StringRef ID) { - return StringSwitch<WebAssembly::ExprType>(ID) - .Case("i32", WebAssembly::ExprType::I32) - .Case("i64", WebAssembly::ExprType::I64) - .Case("f32", WebAssembly::ExprType::F32) - .Case("f64", WebAssembly::ExprType::F64) - .Case("v128", WebAssembly::ExprType::V128) - .Case("exnref", WebAssembly::ExprType::Exnref) - .Case("void", WebAssembly::ExprType::Void) - .Default(WebAssembly::ExprType::Invalid); + WebAssembly::BlockType parseBlockType(StringRef ID) { + // Multivalue block types are handled separately in parseSignature + return StringSwitch<WebAssembly::BlockType>(ID) + .Case("i32", WebAssembly::BlockType::I32) + .Case("i64", WebAssembly::BlockType::I64) + .Case("f32", WebAssembly::BlockType::F32) + .Case("f64", WebAssembly::BlockType::F64) + .Case("v128", WebAssembly::BlockType::V128) + .Case("exnref", WebAssembly::BlockType::Exnref) + .Case("void", WebAssembly::BlockType::Void) + .Default(WebAssembly::BlockType::Invalid); } bool parseRegTypeList(SmallVectorImpl<wasm::ValType> &Types) { @@ -343,7 +344,7 @@ public: int64_t Val = Int.getIntVal(); if (IsNegative) Val = -Val; - Operands.push_back(make_unique<WebAssemblyOperand>( + Operands.push_back(std::make_unique<WebAssemblyOperand>( WebAssemblyOperand::Integer, Int.getLoc(), Int.getEndLoc(), WebAssemblyOperand::IntOp{Val})); Parser.Lex(); @@ -356,7 +357,7 @@ public: return error("Cannot parse real: ", Flt); if (IsNegative) Val = -Val; - Operands.push_back(make_unique<WebAssemblyOperand>( + Operands.push_back(std::make_unique<WebAssemblyOperand>( WebAssemblyOperand::Float, Flt.getLoc(), Flt.getEndLoc(), WebAssemblyOperand::FltOp{Val})); Parser.Lex(); @@ -378,7 +379,7 @@ public: } if (IsNegative) Val = -Val; - Operands.push_back(make_unique<WebAssemblyOperand>( + Operands.push_back(std::make_unique<WebAssemblyOperand>( WebAssemblyOperand::Float, Flt.getLoc(), Flt.getEndLoc(), WebAssemblyOperand::FltOp{Val})); Parser.Lex(); @@ -407,7 +408,7 @@ public: // an opcode until after the assembly matcher, so set a default to fix // up later. 
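The names accepted by parseBlockType above correspond one-for-one to single-byte encodings from the wasm binary format, the same numeric values the BlockType enum later in this diff pins down. As a stand-alone illustration (plain C++ standing in for llvm::StringSwitch; the constants are the wasm encodings, everything else is scaffolding for the sketch):

    #include <string>

    // Simplified stand-in for WebAssembly::BlockType; the values are the
    // one-byte wasm binary encodings, matching the enum defined later in
    // this diff (void = 0x40, i32 = 0x7F, ..., exnref = 0x68).
    enum class BlockType : unsigned {
      Invalid = 0x00,
      Void = 0x40,
      I32 = 0x7F,
      I64 = 0x7E,
      F32 = 0x7D,
      F64 = 0x7C,
      V128 = 0x7B,
      Exnref = 0x68,
    };

    // Same mapping as the StringSwitch in the hunk above.
    BlockType parseBlockType(const std::string &ID) {
      if (ID == "i32") return BlockType::I32;
      if (ID == "i64") return BlockType::I64;
      if (ID == "f32") return BlockType::F32;
      if (ID == "f64") return BlockType::F64;
      if (ID == "v128") return BlockType::V128;
      if (ID == "exnref") return BlockType::Exnref;
      if (ID == "void") return BlockType::Void;
      return BlockType::Invalid; // caller reports "Unknown block type"
    }

Multivalue block types deliberately do not appear in this table; as the new comment notes, they are handled separately through parseSignature.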
auto Tok = Lexer.getTok(); - Operands.push_back(make_unique<WebAssemblyOperand>( + Operands.push_back(std::make_unique<WebAssemblyOperand>( WebAssemblyOperand::Integer, Tok.getLoc(), Tok.getEndLoc(), WebAssemblyOperand::IntOp{-1})); } @@ -416,8 +417,8 @@ public: } void addBlockTypeOperand(OperandVector &Operands, SMLoc NameLoc, - WebAssembly::ExprType BT) { - Operands.push_back(make_unique<WebAssemblyOperand>( + WebAssembly::BlockType BT) { + Operands.push_back(std::make_unique<WebAssemblyOperand>( WebAssemblyOperand::Integer, NameLoc, NameLoc, WebAssemblyOperand::IntOp{static_cast<int64_t>(BT)})); } @@ -449,13 +450,14 @@ public: } // Now construct the name as first operand. - Operands.push_back(make_unique<WebAssemblyOperand>( + Operands.push_back(std::make_unique<WebAssemblyOperand>( WebAssemblyOperand::Token, NameLoc, SMLoc::getFromPointer(Name.end()), WebAssemblyOperand::TokOp{Name})); // If this instruction is part of a control flow structure, ensure // proper nesting. bool ExpectBlockType = false; + bool ExpectFuncType = false; if (Name == "block") { push(Block); ExpectBlockType = true; @@ -489,9 +491,37 @@ public: if (pop(Name, Block)) return true; } else if (Name == "end_function") { + ensureLocals(getStreamer()); CurrentState = EndFunction; if (pop(Name, Function) || ensureEmptyNestingStack()) return true; + } else if (Name == "call_indirect" || Name == "return_call_indirect") { + ExpectFuncType = true; + } + + if (ExpectFuncType || (ExpectBlockType && Lexer.is(AsmToken::LParen))) { + // This has a special TYPEINDEX operand which in text we + // represent as a signature, such that we can re-build this signature, + // attach it to an anonymous symbol, which is what WasmObjectWriter + // expects to be able to recreate the actual unique-ified type indices. + auto Loc = Parser.getTok(); + auto Signature = std::make_unique<wasm::WasmSignature>(); + if (parseSignature(Signature.get())) + return true; + // Got signature as block type, don't need more + ExpectBlockType = false; + auto &Ctx = getStreamer().getContext(); + // The "true" here will cause this to be a nameless symbol. + MCSymbol *Sym = Ctx.createTempSymbol("typeindex", true); + auto *WasmSym = cast<MCSymbolWasm>(Sym); + WasmSym->setSignature(Signature.get()); + addSignature(std::move(Signature)); + WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); + const MCExpr *Expr = MCSymbolRefExpr::create( + WasmSym, MCSymbolRefExpr::VK_WASM_TYPEINDEX, Ctx); + Operands.push_back(std::make_unique<WebAssemblyOperand>( + WebAssemblyOperand::Symbol, Loc.getLoc(), Loc.getEndLoc(), + WebAssemblyOperand::SymOp{Expr})); } while (Lexer.isNot(AsmToken::EndOfStatement)) { @@ -504,7 +534,7 @@ public: if (ExpectBlockType) { // Assume this identifier is a block_type. 
auto BT = parseBlockType(Id.getString()); - if (BT == WebAssembly::ExprType::Invalid) + if (BT == WebAssembly::BlockType::Invalid) return error("Unknown block type: ", Id); addBlockTypeOperand(Operands, NameLoc, BT); Parser.Lex(); @@ -514,7 +544,7 @@ public: SMLoc End; if (Parser.parseExpression(Val, End)) return error("Cannot parse symbol: ", Lexer.getTok()); - Operands.push_back(make_unique<WebAssemblyOperand>( + Operands.push_back(std::make_unique<WebAssemblyOperand>( WebAssemblyOperand::Symbol, Id.getLoc(), Id.getEndLoc(), WebAssemblyOperand::SymOp{Val})); if (checkForP2AlignIfLoadStore(Operands, Name)) @@ -549,7 +579,7 @@ public: } case AsmToken::LCurly: { Parser.Lex(); - auto Op = make_unique<WebAssemblyOperand>( + auto Op = std::make_unique<WebAssemblyOperand>( WebAssemblyOperand::BrList, Tok.getLoc(), Tok.getEndLoc()); if (!Lexer.is(AsmToken::RCurly)) for (;;) { @@ -572,7 +602,7 @@ public: } if (ExpectBlockType && Operands.size() == 1) { // Support blocks with no operands as default to void. - addBlockTypeOperand(Operands, NameLoc, WebAssembly::ExprType::Void); + addBlockTypeOperand(Operands, NameLoc, WebAssembly::BlockType::Void); } Parser.Lex(); return false; @@ -671,7 +701,7 @@ public: LastFunctionLabel = LastLabel; push(Function); } - auto Signature = make_unique<wasm::WasmSignature>(); + auto Signature = std::make_unique<wasm::WasmSignature>(); if (parseSignature(Signature.get())) return true; WasmSym->setSignature(Signature.get()); @@ -687,7 +717,7 @@ public: if (SymName.empty()) return true; auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName)); - auto Signature = make_unique<wasm::WasmSignature>(); + auto Signature = std::make_unique<wasm::WasmSignature>(); if (parseRegTypeList(Signature->Params)) return true; WasmSym->setSignature(Signature.get()); @@ -737,24 +767,30 @@ public: return true; // We didn't process this directive. } + // Called either when the first instruction is parsed or the function ends. + void ensureLocals(MCStreamer &Out) { + if (CurrentState == FunctionStart) { + // We haven't seen a .local directive yet. The streamer requires locals to + // be encoded as a prelude to the instructions, so emit an empty list of + // locals here. + auto &TOut = reinterpret_cast<WebAssemblyTargetStreamer &>( + *Out.getTargetStreamer()); + TOut.emitLocal(SmallVector<wasm::ValType, 0>()); + CurrentState = FunctionLocals; + } + } + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned & /*Opcode*/, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override { MCInst Inst; + Inst.setLoc(IDLoc); unsigned MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm); switch (MatchResult) { case Match_Success: { - if (CurrentState == FunctionStart) { - // This is the first instruction in a function, but we haven't seen - // a .local directive yet. The streamer requires locals to be encoded - // as a prelude to the instructions, so emit an empty list of locals - // here. - auto &TOut = reinterpret_cast<WebAssemblyTargetStreamer &>( - *Out.getTargetStreamer()); - TOut.emitLocal(SmallVector<wasm::ValType, 0>()); - } + ensureLocals(Out); // Fix unknown p2align operands.
auto Align = WebAssembly::GetDefaultP2AlignAny(Inst.getOpcode()); if (Align != -1U) { diff --git a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp index f9bf3f85d30f..9a9c31cff2d5 100644 --- a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp +++ b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp @@ -24,6 +24,7 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCSymbolWasm.h" #include "llvm/Support/Endian.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/TargetRegistry.h" @@ -213,10 +214,29 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction( return MCDisassembler::Fail; break; } - // block_type operands (uint8_t). + // block_type operands: case WebAssembly::OPERAND_SIGNATURE: { - if (!parseImmediate<uint8_t>(MI, Size, Bytes)) + int64_t Val; + uint64_t PrevSize = Size; + if (!nextLEB(Val, Bytes, Size, true)) return MCDisassembler::Fail; + if (Val < 0) { + // Negative values are single septet value types or empty types + if (Size != PrevSize + 1) { + MI.addOperand( + MCOperand::createImm(int64_t(WebAssembly::BlockType::Invalid))); + } else { + MI.addOperand(MCOperand::createImm(Val & 0x7f)); + } + } else { + // We don't have access to the signature, so create a symbol without one + MCSymbol *Sym = getContext().createTempSymbol("typeindex", true); + auto *WasmSym = cast<MCSymbolWasm>(Sym); + WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); + const MCExpr *Expr = MCSymbolRefExpr::create( + WasmSym, MCSymbolRefExpr::VK_WASM_TYPEINDEX, getContext()); + MI.addOperand(MCOperand::createExpr(Expr)); + } break; } // FP operands. diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp index 70b409cf4a90..8314de41021f 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp @@ -31,10 +31,12 @@ namespace { class WebAssemblyAsmBackend final : public MCAsmBackend { bool Is64Bit; + bool IsEmscripten; public: - explicit WebAssemblyAsmBackend(bool Is64Bit) - : MCAsmBackend(support::little), Is64Bit(Is64Bit) {} + explicit WebAssemblyAsmBackend(bool Is64Bit, bool IsEmscripten) + : MCAsmBackend(support::little), Is64Bit(Is64Bit), + IsEmscripten(IsEmscripten) {} unsigned getNumFixupKinds() const override { return WebAssembly::NumTargetFixupKinds; @@ -123,11 +125,11 @@ void WebAssemblyAsmBackend::applyFixup(const MCAssembler &Asm, std::unique_ptr<MCObjectTargetWriter> WebAssemblyAsmBackend::createObjectTargetWriter() const { - return createWebAssemblyWasmObjectWriter(Is64Bit); + return createWebAssemblyWasmObjectWriter(Is64Bit, IsEmscripten); } } // end anonymous namespace MCAsmBackend *llvm::createWebAssemblyAsmBackend(const Triple &TT) { - return new WebAssemblyAsmBackend(TT.isArch64Bit()); + return new WebAssemblyAsmBackend(TT.isArch64Bit(), TT.isOSEmscripten()); } diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp index b5d4d369b726..221ac17b8336 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp @@ -15,6 +15,7 @@ #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "WebAssembly.h" #include "WebAssemblyMachineFunctionInfo.h" +#include 
"WebAssemblyUtilities.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -51,7 +52,9 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, // Print any additional variadic operands. const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - if (Desc.isVariadic()) + if (Desc.isVariadic()) { + if (Desc.getNumOperands() == 0 && MI->getNumOperands() > 0) + OS << "\t"; for (auto I = Desc.getNumOperands(), E = MI->getNumOperands(); I < E; ++I) { // FIXME: For CALL_INDIRECT_VOID, don't print a leading comma, because // we have an extra flags operand which is not currently printed, for @@ -62,6 +65,7 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, OS << ", "; printOperand(MI, I, OS); } + } // Print any added annotation. printAnnotation(OS, Annot); @@ -232,7 +236,16 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); - Op.getExpr()->print(O, &MAI); + // call_indirect instructions have a TYPEINDEX operand that we print + // as a signature here, such that the assembler can recover this + // information. + auto SRE = static_cast<const MCSymbolRefExpr *>(Op.getExpr()); + if (SRE->getKind() == MCSymbolRefExpr::VK_WASM_TYPEINDEX) { + auto &Sym = static_cast<const MCSymbolWasm &>(SRE->getSymbol()); + O << WebAssembly::signatureToString(Sym.getSignature()); + } else { + Op.getExpr()->print(O, &MAI); + } } } @@ -259,14 +272,26 @@ void WebAssemblyInstPrinter::printWebAssemblyP2AlignOperand(const MCInst *MI, void WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - auto Imm = static_cast<unsigned>(MI->getOperand(OpNo).getImm()); - if (Imm != wasm::WASM_TYPE_NORESULT) - O << WebAssembly::anyTypeToString(Imm); + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isImm()) { + auto Imm = static_cast<unsigned>(Op.getImm()); + if (Imm != wasm::WASM_TYPE_NORESULT) + O << WebAssembly::anyTypeToString(Imm); + } else { + auto Expr = cast<MCSymbolRefExpr>(Op.getExpr()); + auto *Sym = cast<MCSymbolWasm>(&Expr->getSymbol()); + if (Sym->getSignature()) { + O << WebAssembly::signatureToString(Sym->getSignature()); + } else { + // Disassembler does not currently produce a signature + O << "unknown_type"; + } + } } // We have various enums representing a subset of these types, use this // function to convert any of them to text. 
-const char *llvm::WebAssembly::anyTypeToString(unsigned Ty) { +const char *WebAssembly::anyTypeToString(unsigned Ty) { switch (Ty) { case wasm::WASM_TYPE_I32: return "i32"; @@ -291,6 +316,24 @@ const char *llvm::WebAssembly::anyTypeToString(unsigned Ty) { } } -const char *llvm::WebAssembly::typeToString(wasm::ValType Ty) { +const char *WebAssembly::typeToString(wasm::ValType Ty) { return anyTypeToString(static_cast<unsigned>(Ty)); } + +std::string WebAssembly::typeListToString(ArrayRef<wasm::ValType> List) { + std::string S; + for (auto &Ty : List) { + if (&Ty != &List[0]) S += ", "; + S += WebAssembly::typeToString(Ty); + } + return S; +} + +std::string WebAssembly::signatureToString(const wasm::WasmSignature *Sig) { + std::string S("("); + S += typeListToString(Sig->Params); + S += ") -> ("; + S += typeListToString(Sig->Returns); + S += ")"; + return S; +} diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h index b979de5028bf..cf37778099a0 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h @@ -58,6 +58,9 @@ namespace WebAssembly { const char *typeToString(wasm::ValType Ty); const char *anyTypeToString(unsigned Ty); +std::string typeListToString(ArrayRef<wasm::ValType> List); +std::string signatureToString(const wasm::WasmSignature *Sig); + } // end namespace WebAssembly } // end namespace llvm diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp index 44b6d6a968a9..1a4c57e66d2f 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp @@ -152,6 +152,7 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( break; case WebAssembly::OPERAND_FUNCTION32: case WebAssembly::OPERAND_OFFSET32: + case WebAssembly::OPERAND_SIGNATURE: case WebAssembly::OPERAND_TYPEINDEX: case WebAssembly::OPERAND_GLOBAL: case WebAssembly::OPERAND_EVENT: diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 7a9f59b1a4f2..b339860a381d 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -38,7 +38,7 @@ MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII); MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT); std::unique_ptr<MCObjectTargetWriter> -createWebAssemblyWasmObjectWriter(bool Is64Bit); +createWebAssemblyWasmObjectWriter(bool Is64Bit, bool IsEmscripten); namespace WebAssembly { enum OperandType { @@ -122,16 +122,22 @@ enum TOF { namespace llvm { namespace WebAssembly { -/// This is used to indicate block signatures. 
-enum class ExprType : unsigned { +/// Used as immediate MachineOperands for block signatures +enum class BlockType : unsigned { + Invalid = 0x00, Void = 0x40, - I32 = 0x7F, - I64 = 0x7E, - F32 = 0x7D, - F64 = 0x7C, - V128 = 0x7B, - Exnref = 0x68, - Invalid = 0x00 + I32 = unsigned(wasm::ValType::I32), + I64 = unsigned(wasm::ValType::I64), + F32 = unsigned(wasm::ValType::F32), + F64 = unsigned(wasm::ValType::F64), + V128 = unsigned(wasm::ValType::V128), + Exnref = unsigned(wasm::ValType::EXNREF), + // Multivalue blocks (and other non-void blocks) are only emitted when the + // blocks will never be exited and are at the ends of functions (see + // WebAssemblyCFGStackify::fixEndsAtEndOfFunction). They also are never made + // to pop values off the stack, so the exact multivalue signature can always + // be inferred from the return type of the parent function in MCInstLower. + Multivalue = 0xffff, }; /// Instruction opcodes emitted via means other than CodeGen. @@ -191,6 +197,8 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) { case WebAssembly::ATOMIC_RMW8_U_CMPXCHG_I32_S: case WebAssembly::ATOMIC_RMW8_U_CMPXCHG_I64: case WebAssembly::ATOMIC_RMW8_U_CMPXCHG_I64_S: + case WebAssembly::LOAD_SPLAT_v8x16: + case WebAssembly::LOAD_SPLAT_v8x16_S: return 0; case WebAssembly::LOAD16_S_I32: case WebAssembly::LOAD16_S_I32_S: @@ -240,6 +248,8 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) { case WebAssembly::ATOMIC_RMW16_U_CMPXCHG_I32_S: case WebAssembly::ATOMIC_RMW16_U_CMPXCHG_I64: case WebAssembly::ATOMIC_RMW16_U_CMPXCHG_I64_S: + case WebAssembly::LOAD_SPLAT_v16x8: + case WebAssembly::LOAD_SPLAT_v16x8_S: return 1; case WebAssembly::LOAD_I32: case WebAssembly::LOAD_I32_S: @@ -295,6 +305,8 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) { case WebAssembly::ATOMIC_NOTIFY_S: case WebAssembly::ATOMIC_WAIT_I32: case WebAssembly::ATOMIC_WAIT_I32_S: + case WebAssembly::LOAD_SPLAT_v32x4: + case WebAssembly::LOAD_SPLAT_v32x4_S: return 2; case WebAssembly::LOAD_I64: case WebAssembly::LOAD_I64_S: @@ -324,31 +336,25 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) { case WebAssembly::ATOMIC_RMW_CMPXCHG_I64_S: case WebAssembly::ATOMIC_WAIT_I64: case WebAssembly::ATOMIC_WAIT_I64_S: + case WebAssembly::LOAD_SPLAT_v64x2: + case WebAssembly::LOAD_SPLAT_v64x2_S: + case WebAssembly::LOAD_EXTEND_S_v8i16: + case WebAssembly::LOAD_EXTEND_S_v8i16_S: + case WebAssembly::LOAD_EXTEND_U_v8i16: + case WebAssembly::LOAD_EXTEND_U_v8i16_S: + case WebAssembly::LOAD_EXTEND_S_v4i32: + case WebAssembly::LOAD_EXTEND_S_v4i32_S: + case WebAssembly::LOAD_EXTEND_U_v4i32: + case WebAssembly::LOAD_EXTEND_U_v4i32_S: + case WebAssembly::LOAD_EXTEND_S_v2i64: + case WebAssembly::LOAD_EXTEND_S_v2i64_S: + case WebAssembly::LOAD_EXTEND_U_v2i64: + case WebAssembly::LOAD_EXTEND_U_v2i64_S: return 3; - case WebAssembly::LOAD_v16i8: - case WebAssembly::LOAD_v16i8_S: - case WebAssembly::LOAD_v8i16: - case WebAssembly::LOAD_v8i16_S: - case WebAssembly::LOAD_v4i32: - case WebAssembly::LOAD_v4i32_S: - case WebAssembly::LOAD_v2i64: - case WebAssembly::LOAD_v2i64_S: - case WebAssembly::LOAD_v4f32: - case WebAssembly::LOAD_v4f32_S: - case WebAssembly::LOAD_v2f64: - case WebAssembly::LOAD_v2f64_S: - case WebAssembly::STORE_v16i8: - case WebAssembly::STORE_v16i8_S: - case WebAssembly::STORE_v8i16: - case WebAssembly::STORE_v8i16_S: - case WebAssembly::STORE_v4i32: - case WebAssembly::STORE_v4i32_S: - case WebAssembly::STORE_v2i64: - case WebAssembly::STORE_v2i64_S: - case WebAssembly::STORE_v4f32: - case WebAssembly::STORE_v4f32_S: - case 
WebAssembly::STORE_v2f64: - case WebAssembly::STORE_v2f64_S: + case WebAssembly::LOAD_V128: + case WebAssembly::LOAD_V128_S: + case WebAssembly::STORE_V128: + case WebAssembly::STORE_V128_S: return 4; default: return -1; diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp index e05efef7201b..40926201931a 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp @@ -60,39 +60,10 @@ void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<wasm::ValType> Types) { void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; } -void WebAssemblyTargetAsmStreamer::emitSignature( - const wasm::WasmSignature *Sig) { - OS << "("; - emitParamList(Sig); - OS << ") -> ("; - emitReturnList(Sig); - OS << ")"; -} - -void WebAssemblyTargetAsmStreamer::emitParamList( - const wasm::WasmSignature *Sig) { - auto &Params = Sig->Params; - for (auto &Ty : Params) { - if (&Ty != &Params[0]) - OS << ", "; - OS << WebAssembly::typeToString(Ty); - } -} - -void WebAssemblyTargetAsmStreamer::emitReturnList( - const wasm::WasmSignature *Sig) { - auto &Returns = Sig->Returns; - for (auto &Ty : Returns) { - if (&Ty != &Returns[0]) - OS << ", "; - OS << WebAssembly::typeToString(Ty); - } -} - void WebAssemblyTargetAsmStreamer::emitFunctionType(const MCSymbolWasm *Sym) { assert(Sym->isFunction()); OS << "\t.functype\t" << Sym->getName() << " "; - emitSignature(Sym->getSignature()); + OS << WebAssembly::signatureToString(Sym->getSignature()); OS << "\n"; } @@ -107,7 +78,7 @@ void WebAssemblyTargetAsmStreamer::emitGlobalType(const MCSymbolWasm *Sym) { void WebAssemblyTargetAsmStreamer::emitEventType(const MCSymbolWasm *Sym) { assert(Sym->isEvent()); OS << "\t.eventtype\t" << Sym->getName() << " "; - emitParamList(Sym->getSignature()); + OS << WebAssembly::typeListToString(Sym->getSignature()->Params); OS << "\n"; } diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h index 5ea62b179d22..0164f8e572ef 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h @@ -56,9 +56,6 @@ protected: /// This part is for ascii assembly output class WebAssemblyTargetAsmStreamer final : public WebAssemblyTargetStreamer { formatted_raw_ostream &OS; - void emitSignature(const wasm::WasmSignature *Sig); - void emitParamList(const wasm::WasmSignature *Sig); - void emitReturnList(const wasm::WasmSignature *Sig); public: WebAssemblyTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp index a1cc3e268e8f..e7a599e3e175 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp @@ -31,7 +31,7 @@ using namespace llvm; namespace { class WebAssemblyWasmObjectWriter final : public MCWasmObjectTargetWriter { public: - explicit WebAssemblyWasmObjectWriter(bool Is64Bit); + explicit WebAssemblyWasmObjectWriter(bool Is64Bit, bool IsEmscripten); private: unsigned getRelocType(const MCValue &Target, @@ -39,8 +39,9 @@ private: }; } // end anonymous namespace -WebAssemblyWasmObjectWriter::WebAssemblyWasmObjectWriter(bool Is64Bit) - : 
MCWasmObjectTargetWriter(Is64Bit) {} +WebAssemblyWasmObjectWriter::WebAssemblyWasmObjectWriter(bool Is64Bit, + bool IsEmscripten) + : MCWasmObjectTargetWriter(Is64Bit, IsEmscripten) {} static const MCSection *getFixupSection(const MCExpr *Expr) { if (auto SyExp = dyn_cast<MCSymbolRefExpr>(Expr)) { @@ -116,6 +117,6 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target, } std::unique_ptr<MCObjectTargetWriter> -llvm::createWebAssemblyWasmObjectWriter(bool Is64Bit) { - return llvm::make_unique<WebAssemblyWasmObjectWriter>(Is64Bit); +llvm::createWebAssemblyWasmObjectWriter(bool Is64Bit, bool IsEmscripten) { + return std::make_unique<WebAssemblyWasmObjectWriter>(Is64Bit, IsEmscripten); } diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index 7f9d41da3978..5d8b873ce23b 100644 --- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -67,8 +67,8 @@ MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const { } std::string WebAssemblyAsmPrinter::regToString(const MachineOperand &MO) { - unsigned RegNo = MO.getReg(); - assert(TargetRegisterInfo::isVirtualRegister(RegNo) && + Register RegNo = MO.getReg(); + assert(Register::isVirtualRegister(RegNo) && "Unlowered physical register encountered during assembly printing"); assert(!MFI->isVRegStackified(RegNo)); unsigned WAReg = MFI->getWAReg(RegNo); @@ -332,43 +332,15 @@ void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) { // These represent values which are live into the function entry, so there's // no instruction to emit. break; - case WebAssembly::FALLTHROUGH_RETURN_I32: - case WebAssembly::FALLTHROUGH_RETURN_I32_S: - case WebAssembly::FALLTHROUGH_RETURN_I64: - case WebAssembly::FALLTHROUGH_RETURN_I64_S: - case WebAssembly::FALLTHROUGH_RETURN_F32: - case WebAssembly::FALLTHROUGH_RETURN_F32_S: - case WebAssembly::FALLTHROUGH_RETURN_F64: - case WebAssembly::FALLTHROUGH_RETURN_F64_S: - case WebAssembly::FALLTHROUGH_RETURN_v16i8: - case WebAssembly::FALLTHROUGH_RETURN_v16i8_S: - case WebAssembly::FALLTHROUGH_RETURN_v8i16: - case WebAssembly::FALLTHROUGH_RETURN_v8i16_S: - case WebAssembly::FALLTHROUGH_RETURN_v4i32: - case WebAssembly::FALLTHROUGH_RETURN_v4i32_S: - case WebAssembly::FALLTHROUGH_RETURN_v2i64: - case WebAssembly::FALLTHROUGH_RETURN_v2i64_S: - case WebAssembly::FALLTHROUGH_RETURN_v4f32: - case WebAssembly::FALLTHROUGH_RETURN_v4f32_S: - case WebAssembly::FALLTHROUGH_RETURN_v2f64: - case WebAssembly::FALLTHROUGH_RETURN_v2f64_S: { + case WebAssembly::FALLTHROUGH_RETURN: { // These instructions represent the implicit return at the end of a - // function body. Always pops one value off the stack. + // function body. if (isVerbose()) { - OutStreamer->AddComment("fallthrough-return-value"); + OutStreamer->AddComment("fallthrough-return"); OutStreamer->AddBlankLine(); } break; } - case WebAssembly::FALLTHROUGH_RETURN_VOID: - case WebAssembly::FALLTHROUGH_RETURN_VOID_S: - // This instruction represents the implicit return at the end of a - // function body with no return value. - if (isVerbose()) { - OutStreamer->AddComment("fallthrough-return-void"); - OutStreamer->AddBlankLine(); - } - break; case WebAssembly::COMPILER_FENCE: // This is a compiler barrier that prevents instruction reordering during // backend compilation, and should not be emitted. 
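With the printer changes above, every textual signature is now rendered by WebAssembly::signatureToString as "(params) -> (returns)" with comma-separated value types. For example, a signature taking two i32s and returning an i64 prints as (i32, i32) -> (i64), so emitFunctionType produces a line like `.functype foo (i32, i32) -> (i64)` (the symbol name foo is hypothetical). The assembler's parseSignature accepts the same shape, so signatures round-trip through the printer and parser, which is what the new call_indirect TYPEINDEX handling relies on.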
diff --git a/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp index 4c5d0192fc28..c069af9eed62 100644 --- a/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp +++ b/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp @@ -97,14 +97,14 @@ public: // If the smallest region containing MBB is a loop if (LoopMap.count(ML)) return LoopMap[ML].get(); - LoopMap[ML] = llvm::make_unique<ConcreteRegion<MachineLoop>>(ML); + LoopMap[ML] = std::make_unique<ConcreteRegion<MachineLoop>>(ML); return LoopMap[ML].get(); } else { // If the smallest region containing MBB is an exception if (ExceptionMap.count(WE)) return ExceptionMap[WE].get(); ExceptionMap[WE] = - llvm::make_unique<ConcreteRegion<WebAssemblyException>>(WE); + std::make_unique<ConcreteRegion<WebAssemblyException>>(WE); return ExceptionMap[WE].get(); } } @@ -317,6 +317,7 @@ static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI, // If Next was originally ordered before MBB, and it isn't because it was // loop-rotated above the header, it's not preferred. if (Next->getNumber() < MBB->getNumber() && + (WasmDisableEHPadSort || !Next->isEHPad()) && (!R || !R->contains(Next) || R->getHeader()->getNumber() < Next->getNumber())) { Ready.push(Next); diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index e6bfc5226e2e..7e867edaaa27 100644 --- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -29,6 +29,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/MC/MCAsmInfo.h" using namespace llvm; @@ -315,12 +316,12 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { // br_on_exn 0, $__cpp_exception // rethrow // end_block - WebAssembly::ExprType ReturnType = WebAssembly::ExprType::Void; + WebAssembly::BlockType ReturnType = WebAssembly::BlockType::Void; if (IsBrOnExn) { const char *TagName = BrOnExn->getOperand(1).getSymbolName(); if (std::strcmp(TagName, "__cpp_exception") != 0) llvm_unreachable("Only C++ exception is supported"); - ReturnType = WebAssembly::ExprType::I32; + ReturnType = WebAssembly::BlockType::I32; } auto InsertPos = getLatestInsertPos(Header, BeforeSet, AfterSet); @@ -406,7 +407,7 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) { auto InsertPos = getEarliestInsertPos(&MBB, BeforeSet, AfterSet); MachineInstr *Begin = BuildMI(MBB, InsertPos, MBB.findDebugLoc(InsertPos), TII.get(WebAssembly::LOOP)) - .addImm(int64_t(WebAssembly::ExprType::Void)); + .addImm(int64_t(WebAssembly::BlockType::Void)); // Decide where in Header to put the END_LOOP. BeforeSet.clear(); @@ -526,46 +527,56 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { AfterSet.insert(&MI); } - // Local expression tree should go after the TRY. - for (auto I = Header->getFirstTerminator(), E = Header->begin(); I != E; - --I) { - if (std::prev(I)->isDebugInstr() || std::prev(I)->isPosition()) - continue; - if (WebAssembly::isChild(*std::prev(I), MFI)) - AfterSet.insert(&*std::prev(I)); - else - break; - } - // If Header unwinds to MBB (= Header contains 'invoke'), the try block should // contain the call within it. So the call should go after the TRY. The // exception is when the header's terminator is a rethrow instruction, in // which case that instruction, not a call instruction before it, is gonna // throw. 
+ MachineInstr *ThrowingCall = nullptr; if (MBB.isPredecessor(Header)) { auto TermPos = Header->getFirstTerminator(); if (TermPos == Header->end() || TermPos->getOpcode() != WebAssembly::RETHROW) { - for (const auto &MI : reverse(*Header)) { + for (auto &MI : reverse(*Header)) { if (MI.isCall()) { AfterSet.insert(&MI); + ThrowingCall = &MI; // Possibly throwing calls are usually wrapped by EH_LABEL // instructions. We don't want to split them and the call. if (MI.getIterator() != Header->begin() && - std::prev(MI.getIterator())->isEHLabel()) + std::prev(MI.getIterator())->isEHLabel()) { AfterSet.insert(&*std::prev(MI.getIterator())); + ThrowingCall = &*std::prev(MI.getIterator()); + } break; } } } } + // Local expression tree should go after the TRY. + // For BLOCK placement, we start the search from the previous instruction of a + // BB's terminator, but in TRY's case, we should start from the previous + // instruction of a call that can throw, or an EH_LABEL that precedes the call, + // because the return values of the call's previous instructions can be + // stackified and consumed by the throwing call. + auto SearchStartPt = ThrowingCall ? MachineBasicBlock::iterator(ThrowingCall) : Header->getFirstTerminator(); + for (auto I = SearchStartPt, E = Header->begin(); I != E; --I) { + if (std::prev(I)->isDebugInstr() || std::prev(I)->isPosition()) + continue; + if (WebAssembly::isChild(*std::prev(I), MFI)) + AfterSet.insert(&*std::prev(I)); + else + break; + } + // Add the TRY. auto InsertPos = getLatestInsertPos(Header, BeforeSet, AfterSet); MachineInstr *Begin = BuildMI(*Header, InsertPos, Header->findDebugLoc(InsertPos), TII.get(WebAssembly::TRY)) - .addImm(int64_t(WebAssembly::ExprType::Void)); + .addImm(int64_t(WebAssembly::BlockType::Void)); // Decide where in Header to put the END_TRY. BeforeSet.clear(); @@ -694,8 +705,26 @@ void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) { } } +// When MBB is split into MBB and Split, we should unstackify defs in MBB that +// have their uses in Split.
+static void unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB, + MachineBasicBlock &Split, + WebAssemblyFunctionInfo &MFI, + MachineRegisterInfo &MRI) { + for (auto &MI : Split) { + for (auto &MO : MI.explicit_uses()) { + if (!MO.isReg() || Register::isPhysicalRegister(MO.getReg())) + continue; + if (MachineInstr *Def = MRI.getUniqueVRegDef(MO.getReg())) + if (Def->getParent() == &MBB) + MFI.unstackifyVReg(MO.getReg()); + } + } +} + bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) { const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); + auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); MachineRegisterInfo &MRI = MF.getRegInfo(); // Linearizing the control flow by placing TRY / END_TRY markers can create @@ -830,7 +859,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) { SmallVector<const MachineBasicBlock *, 8> EHPadStack; // Range of instructions to be wrapped in a new nested try/catch using TryRange = std::pair<MachineInstr *, MachineInstr *>; - // In original CFG, <unwind destionation BB, a vector of try ranges> + // In original CFG, <unwind destination BB, a vector of try ranges> DenseMap<MachineBasicBlock *, SmallVector<TryRange, 4>> UnwindDestToTryRanges; // In new CFG, <destination to branch to, a vector of try ranges> DenseMap<MachineBasicBlock *, SmallVector<TryRange, 4>> BrDestToTryRanges; @@ -936,7 +965,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) { // of the function with a local.get and a rethrow instruction. if (NeedAppendixBlock) { auto *AppendixBB = getAppendixBlock(MF); - unsigned ExnReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass); + Register ExnReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass); BuildMI(AppendixBB, DebugLoc(), TII.get(WebAssembly::RETHROW)) .addReg(ExnReg); // These instruction ranges should branch to this appendix BB. @@ -967,7 +996,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) { // ... // cont: for (auto &P : UnwindDestToTryRanges) { - NumUnwindMismatches++; + NumUnwindMismatches += P.second.size(); // This means the destination is the appendix BB, which was separately // handled above. @@ -1007,6 +1036,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) { BrDest->insert(BrDest->end(), EndTry->removeFromParent()); // Take out the handler body from EH pad to the new branch destination BB. BrDest->splice(BrDest->end(), EHPad, SplitPos, EHPad->end()); + unstackifyVRegsUsedInSplitBB(*EHPad, *BrDest, MFI, MRI); // Fix predecessor-successor relationship. BrDest->transferSuccessors(EHPad); EHPad->addSuccessor(BrDest); @@ -1100,7 +1130,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) { MachineInstr *NestedTry = BuildMI(*MBB, *RangeBegin, RangeBegin->getDebugLoc(), TII.get(WebAssembly::TRY)) - .addImm(int64_t(WebAssembly::ExprType::Void)); + .addImm(int64_t(WebAssembly::BlockType::Void)); // Create the nested EH pad and fill instructions in. MachineBasicBlock *NestedEHPad = MF.CreateMachineBasicBlock(); @@ -1122,6 +1152,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) { // new nested continuation BB. NestedCont->splice(NestedCont->end(), MBB, std::next(RangeEnd->getIterator()), MBB->end()); + unstackifyVRegsUsedInSplitBB(*MBB, *NestedCont, MFI, MRI); registerTryScope(NestedTry, NestedEndTry, NestedEHPad); // Fix predecessor-successor relationship.
@@ -1197,54 +1228,32 @@ getDepth(const SmallVectorImpl<const MachineBasicBlock *> &Stack, /// checks for such cases and fixes up the signatures. void WebAssemblyCFGStackify::fixEndsAtEndOfFunction(MachineFunction &MF) { const auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); - assert(MFI.getResults().size() <= 1); if (MFI.getResults().empty()) return; - WebAssembly::ExprType RetType; - switch (MFI.getResults().front().SimpleTy) { - case MVT::i32: - RetType = WebAssembly::ExprType::I32; - break; - case MVT::i64: - RetType = WebAssembly::ExprType::I64; - break; - case MVT::f32: - RetType = WebAssembly::ExprType::F32; - break; - case MVT::f64: - RetType = WebAssembly::ExprType::F64; - break; - case MVT::v16i8: - case MVT::v8i16: - case MVT::v4i32: - case MVT::v2i64: - case MVT::v4f32: - case MVT::v2f64: - RetType = WebAssembly::ExprType::V128; - break; - case MVT::exnref: - RetType = WebAssembly::ExprType::Exnref; - break; - default: - llvm_unreachable("unexpected return type"); - } + // MCInstLower will add the proper types to multivalue signatures based on the + // function return type + WebAssembly::BlockType RetType = + MFI.getResults().size() > 1 + ? WebAssembly::BlockType::Multivalue + : WebAssembly::BlockType( + WebAssembly::toValType(MFI.getResults().front())); for (MachineBasicBlock &MBB : reverse(MF)) { for (MachineInstr &MI : reverse(MBB)) { if (MI.isPosition() || MI.isDebugInstr()) continue; - if (MI.getOpcode() == WebAssembly::END_BLOCK) { - EndToBegin[&MI]->getOperand(0).setImm(int32_t(RetType)); - continue; - } - if (MI.getOpcode() == WebAssembly::END_LOOP) { + switch (MI.getOpcode()) { + case WebAssembly::END_BLOCK: + case WebAssembly::END_LOOP: + case WebAssembly::END_TRY: EndToBegin[&MI]->getOperand(0).setImm(int32_t(RetType)); continue; + default: + // Something other than an `end`. We're done. + return; } - // Something other than an `end`. We're done. - return; } } } @@ -1280,7 +1289,9 @@ void WebAssemblyCFGStackify::placeMarkers(MachineFunction &MF) { } } // Fix mismatches in unwind destinations induced by linearizing the code. - fixUnwindMismatches(MF); + if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm && + MF.getFunction().hasPersonalityFn()) + fixUnwindMismatches(MF); } void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) { diff --git a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp index dbd62179f055..ef75bb215317 100644 --- a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp +++ b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp @@ -168,7 +168,7 @@ static MVT typeForRegClass(const TargetRegisterClass *RC) { static MachineInstr *findStartOfTree(MachineOperand &MO, MachineRegisterInfo &MRI, WebAssemblyFunctionInfo &MFI) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); assert(MFI.isVRegStackified(Reg)); MachineInstr *Def = MRI.getVRegDef(Reg); @@ -207,7 +207,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { MachineInstr &MI = *I++; if (!WebAssembly::isArgument(MI.getOpcode())) break; - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); assert(!MFI.isVRegStackified(Reg)); Reg2Local[Reg] = static_cast<unsigned>(MI.getOperand(1).getImm()); MI.eraseFromParent(); @@ -221,7 +221,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { // drops to their defs. 
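A side note on the unsigned-to-Register swaps that run through these hunks (index2VirtReg in the loop just below, and the createVirtualRegister results elsewhere): Register keeps the long-standing encoding in which virtual registers are simply indices with the top bit set. A model of that mapping; this is an assumption about the implementation detail of this revision, not code from the commit:

    #include <cstdint>

    // Assumed encoding: virtual register numbers carry bit 31, so they
    // can never collide with physical register numbers.
    constexpr uint32_t FirstVirtualRegister = 1u << 31;

    constexpr uint32_t index2VirtReg(uint32_t Index) {
      return Index | FirstVirtualRegister;
    }
    constexpr uint32_t virtReg2Index(uint32_t Reg) {
      return Reg & ~FirstVirtualRegister;
    }
    constexpr bool isVirtualRegister(uint32_t Reg) {
      return (Reg & FirstVirtualRegister) != 0;
    }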
BitVector UseEmpty(MRI.getNumVirtRegs()); for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) - UseEmpty[I] = MRI.use_empty(TargetRegisterInfo::index2VirtReg(I)); + UseEmpty[I] = MRI.use_empty(Register::index2VirtReg(I)); // Visit each instruction in the function. for (MachineBasicBlock &MBB : MF) { @@ -238,13 +238,13 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { if (WebAssembly::isTee(MI.getOpcode())) { assert(MFI.isVRegStackified(MI.getOperand(0).getReg())); assert(!MFI.isVRegStackified(MI.getOperand(1).getReg())); - unsigned OldReg = MI.getOperand(2).getReg(); + Register OldReg = MI.getOperand(2).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(OldReg); // Stackify the input if it isn't stackified yet. if (!MFI.isVRegStackified(OldReg)) { unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg); - unsigned NewReg = MRI.createVirtualRegister(RC); + Register NewReg = MRI.createVirtualRegister(RC); unsigned Opc = getLocalGetOpcode(RC); BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Opc), NewReg) .addImm(LocalId); @@ -270,17 +270,17 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { // we handle at most one def. assert(MI.getDesc().getNumDefs() <= 1); if (MI.getDesc().getNumDefs() == 1) { - unsigned OldReg = MI.getOperand(0).getReg(); + Register OldReg = MI.getOperand(0).getReg(); if (!MFI.isVRegStackified(OldReg)) { const TargetRegisterClass *RC = MRI.getRegClass(OldReg); - unsigned NewReg = MRI.createVirtualRegister(RC); + Register NewReg = MRI.createVirtualRegister(RC); auto InsertPt = std::next(MI.getIterator()); if (MI.getOpcode() == WebAssembly::IMPLICIT_DEF) { MI.eraseFromParent(); Changed = true; continue; } - if (UseEmpty[TargetRegisterInfo::virtReg2Index(OldReg)]) { + if (UseEmpty[Register::virtReg2Index(OldReg)]) { unsigned Opc = getDropOpcode(RC); MachineInstr *Drop = BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc)) @@ -310,7 +310,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { if (!MO.isReg()) continue; - unsigned OldReg = MO.getReg(); + Register OldReg = MO.getReg(); // Inline asm may have a def in the middle of the operands. Our contract // with inline asm register operands is to provide local indices as @@ -345,7 +345,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { // Insert a local.get. unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg); const TargetRegisterClass *RC = MRI.getRegClass(OldReg); - unsigned NewReg = MRI.createVirtualRegister(RC); + Register NewReg = MRI.createVirtualRegister(RC); unsigned Opc = getLocalGetOpcode(RC); InsertPt = BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc), NewReg) @@ -369,7 +369,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { // TODO: Sort the locals for better compression. 
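The hunks above repeatedly call getLocalId, which is defined earlier in WebAssemblyExplicitLocals.cpp and does not appear in this diff. Roughly, it hands out wasm local indices on first use and caches them, with parameters already owning the first indices via the ARGUMENT loop earlier; a sketch under that reading (std::unordered_map standing in for the map the pass actually uses):

    #include <unordered_map>

    // Return the wasm local index for Reg, allocating the next free index
    // (CurLocal) the first time Reg is seen.
    static unsigned getLocalId(std::unordered_map<unsigned, unsigned> &Reg2Local,
                               unsigned &CurLocal, unsigned Reg) {
      auto Inserted = Reg2Local.insert({Reg, CurLocal});
      if (Inserted.second) // new register: consume a fresh local index
        ++CurLocal;
      return Inserted.first->second;
    }

That invariant is what makes the bookkeeping just below work: setNumLocals(CurLocal - getParams().size()) counts only the non-parameter locals, and the final loop skips map entries whose index falls inside the parameter range.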
MFI.setNumLocals(CurLocal - MFI.getParams().size()); for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + unsigned Reg = Register::index2VirtReg(I); auto RL = Reg2Local.find(Reg); if (RL == Reg2Local.end() || RL->second < MFI.getParams().size()) continue; diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 2552e9150833..c932f985489a 100644 --- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -1141,14 +1141,14 @@ bool WebAssemblyFastISel::selectBitCast(const Instruction *I) { return true; } - unsigned Reg = fastEmit_ISD_BITCAST_r(VT.getSimpleVT(), RetVT.getSimpleVT(), + Register Reg = fastEmit_ISD_BITCAST_r(VT.getSimpleVT(), RetVT.getSimpleVT(), In, I->getOperand(0)->hasOneUse()); if (!Reg) return false; MachineBasicBlock::iterator Iter = FuncInfo.InsertPt; --Iter; assert(Iter->isBitcast()); - Iter->setPhysRegsDeadExcept(ArrayRef<unsigned>(), TRI); + Iter->setPhysRegsDeadExcept(ArrayRef<Register>(), TRI); updateValueMap(I, Reg); return true; } @@ -1302,51 +1302,33 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) { if (Ret->getNumOperands() == 0) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(WebAssembly::RETURN_VOID)); + TII.get(WebAssembly::RETURN)); return true; } + // TODO: support multiple return in FastISel + if (Ret->getNumOperands() > 1) + return false; + Value *RV = Ret->getOperand(0); if (!Subtarget->hasSIMD128() && RV->getType()->isVectorTy()) return false; - unsigned Opc; switch (getSimpleType(RV->getType())) { case MVT::i1: case MVT::i8: case MVT::i16: case MVT::i32: - Opc = WebAssembly::RETURN_I32; - break; case MVT::i64: - Opc = WebAssembly::RETURN_I64; - break; case MVT::f32: - Opc = WebAssembly::RETURN_F32; - break; case MVT::f64: - Opc = WebAssembly::RETURN_F64; - break; case MVT::v16i8: - Opc = WebAssembly::RETURN_v16i8; - break; case MVT::v8i16: - Opc = WebAssembly::RETURN_v8i16; - break; case MVT::v4i32: - Opc = WebAssembly::RETURN_v4i32; - break; case MVT::v2i64: - Opc = WebAssembly::RETURN_v2i64; - break; case MVT::v4f32: - Opc = WebAssembly::RETURN_v4f32; - break; case MVT::v2f64: - Opc = WebAssembly::RETURN_v2f64; - break; case MVT::exnref: - Opc = WebAssembly::RETURN_EXNREF; break; default: return false; @@ -1363,7 +1345,9 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) { if (Reg == 0) return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)).addReg(Reg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(WebAssembly::RETURN)) + .addReg(Reg); return true; } diff --git a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp index b7fc65401fc4..6b1bbd7a2b07 100644 --- a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp @@ -70,6 +70,8 @@ static void findUses(Value *V, Function &F, for (Use &U : V->uses()) { if (auto *BC = dyn_cast<BitCastOperator>(U.getUser())) findUses(BC, F, Uses, ConstantBCs); + else if (auto *A = dyn_cast<GlobalAlias>(U.getUser())) + findUses(A, F, Uses, ConstantBCs); else if (U.get()->getType() != F.getType()) { CallSite CS(U.getUser()); if (!CS) diff --git a/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp b/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp index 7d8e86d9b2c0..157ea9d525c9 100644 --- 
a/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp @@ -56,6 +56,7 @@ #include "WebAssembly.h" #include "WebAssemblySubtarget.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/Debug.h" using namespace llvm; #define DEBUG_TYPE "wasm-fix-irreducible-control-flow" @@ -358,7 +359,7 @@ void WebAssemblyFixIrreducibleControlFlow::makeSingleEntryLoop( // Add the register which will be used to tell the jump table which block to // jump to. MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + Register Reg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); MIB.addReg(Reg); // Compute the indices in the superheader, one for each bad block, and diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp index 5299068efdd4..71eeebfada4b 100644 --- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp @@ -183,14 +183,14 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF, bool HasBP = hasBP(MF); if (HasBP) { auto FI = MF.getInfo<WebAssemblyFunctionInfo>(); - unsigned BasePtr = MRI.createVirtualRegister(PtrRC); + Register BasePtr = MRI.createVirtualRegister(PtrRC); FI->setBasePointerVreg(BasePtr); BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::COPY), BasePtr) .addReg(SPReg); } if (StackSize) { // Subtract the frame size - unsigned OffsetReg = MRI.createVirtualRegister(PtrRC); + Register OffsetReg = MRI.createVirtualRegister(PtrRC); BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg) .addImm(StackSize); BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::SUB_I32), @@ -199,7 +199,7 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF, .addReg(OffsetReg); } if (HasBP) { - unsigned BitmaskReg = MRI.createVirtualRegister(PtrRC); + Register BitmaskReg = MRI.createVirtualRegister(PtrRC); unsigned Alignment = MFI.getMaxAlignment(); assert((1u << countTrailingZeros(Alignment)) == Alignment && "Alignment must be a power of 2"); @@ -244,7 +244,7 @@ void WebAssemblyFrameLowering::emitEpilogue(MachineFunction &MF, } else if (StackSize) { const TargetRegisterClass *PtrRC = MRI.getTargetRegisterInfo()->getPointerRegClass(MF); - unsigned OffsetReg = MRI.createVirtualRegister(PtrRC); + Register OffsetReg = MRI.createVirtualRegister(PtrRC); BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg) .addImm(StackSize); // In the epilog we don't need to write the result back to the SP32 physreg diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h index daddd4ca16ff..fdc0f561dcd9 100644 --- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h +++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h @@ -29,9 +29,9 @@ public: static const size_t RedZoneSize = 128; WebAssemblyFrameLowering() - : TargetFrameLowering(StackGrowsDown, /*StackAlignment=*/16, + : TargetFrameLowering(StackGrowsDown, /*StackAlignment=*/Align(16), /*LocalAreaOffset=*/0, - /*TransientStackAlignment=*/16, + /*TransientStackAlignment=*/Align(16), /*StackRealignable=*/true) {} MachineBasicBlock::iterator diff --git a/lib/Target/WebAssembly/WebAssemblyISD.def b/lib/Target/WebAssembly/WebAssemblyISD.def index 77217f16a727..13f0476eb4a5 100644 --- a/lib/Target/WebAssembly/WebAssemblyISD.def +++ b/lib/Target/WebAssembly/WebAssemblyISD.def @@ -26,9 +26,11 
@@ HANDLE_NODETYPE(WrapperPIC) HANDLE_NODETYPE(BR_IF) HANDLE_NODETYPE(BR_TABLE) HANDLE_NODETYPE(SHUFFLE) +HANDLE_NODETYPE(SWIZZLE) HANDLE_NODETYPE(VEC_SHL) HANDLE_NODETYPE(VEC_SHR_S) HANDLE_NODETYPE(VEC_SHR_U) +HANDLE_NODETYPE(LOAD_SPLAT) HANDLE_NODETYPE(THROW) HANDLE_NODETYPE(MEMORY_COPY) HANDLE_NODETYPE(MEMORY_FILL) diff --git a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp index 26339eaef37d..f83a8a984ae0 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp @@ -54,6 +54,12 @@ public: ForCodeSize = MF.getFunction().hasOptSize(); Subtarget = &MF.getSubtarget<WebAssemblySubtarget>(); + + // Wasm64 is not fully supported right now (and is not specified) + if (Subtarget->hasAddr64()) + report_fatal_error( + "64-bit WebAssembly (wasm64) is not currently supported"); + return SelectionDAGISel::runOnMachineFunction(MF); } @@ -88,88 +94,36 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) { uint64_t SyncScopeID = cast<ConstantSDNode>(Node->getOperand(2).getNode())->getZExtValue(); + MachineSDNode *Fence = nullptr; switch (SyncScopeID) { - case SyncScope::SingleThread: { + case SyncScope::SingleThread: // We lower a single-thread fence to a pseudo compiler barrier instruction // preventing instruction reordering. This will not be emitted in final // binary. - MachineSDNode *Fence = - CurDAG->getMachineNode(WebAssembly::COMPILER_FENCE, - DL, // debug loc - MVT::Other, // outchain type - Node->getOperand(0) // inchain - ); - ReplaceNode(Node, Fence); - CurDAG->RemoveDeadNode(Node); - return; - } - - case SyncScope::System: { - // For non-emscripten systems, we have not decided on what we should - // traslate fences to yet. - if (!Subtarget->getTargetTriple().isOSEmscripten()) - report_fatal_error( - "ATOMIC_FENCE is not yet supported in non-emscripten OSes"); - - // Wasm does not have a fence instruction, but because all atomic - // instructions in wasm are sequentially consistent, we translate a - // fence to an idempotent atomic RMW instruction to a linear memory - // address. All atomic instructions in wasm are sequentially consistent, - // but this is to ensure a fence also prevents reordering of non-atomic - // instructions in the VM. Even though LLVM IR's fence instruction does - // not say anything about its relationship with non-atomic instructions, - // we think this is more user-friendly. - // - // While any address can work, here we use a value stored in - // __stack_pointer wasm global because there's high chance that area is - // in cache. - // - // So the selected instructions will be in the form of: - // %addr = get_global $__stack_pointer - // %0 = i32.const 0 - // i32.atomic.rmw.or %addr, %0 - SDValue StackPtrSym = CurDAG->getTargetExternalSymbol( - "__stack_pointer", TLI->getPointerTy(CurDAG->getDataLayout())); - MachineSDNode *GetGlobal = - CurDAG->getMachineNode(WebAssembly::GLOBAL_GET_I32, // opcode - DL, // debug loc - MVT::i32, // result type - StackPtrSym // __stack_pointer symbol - ); - - SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); - auto *MMO = MF.getMachineMemOperand( - MachinePointerInfo::getUnknownStack(MF), - // FIXME Volatile isn't really correct, but currently all LLVM - // atomic instructions are treated as volatiles in the backend, so - // we should be consistent. 
- MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad | - MachineMemOperand::MOStore, - 4, 4, AAMDNodes(), nullptr, SyncScope::System, - AtomicOrdering::SequentiallyConsistent); - MachineSDNode *Const0 = - CurDAG->getMachineNode(WebAssembly::CONST_I32, DL, MVT::i32, Zero); - MachineSDNode *AtomicRMW = CurDAG->getMachineNode( - WebAssembly::ATOMIC_RMW_OR_I32, // opcode - DL, // debug loc - MVT::i32, // result type - MVT::Other, // outchain type - { - Zero, // alignment - Zero, // offset - SDValue(GetGlobal, 0), // __stack_pointer - SDValue(Const0, 0), // OR with 0 to make it idempotent - Node->getOperand(0) // inchain - }); - - CurDAG->setNodeMemRefs(AtomicRMW, {MMO}); - ReplaceUses(SDValue(Node, 0), SDValue(AtomicRMW, 1)); - CurDAG->RemoveDeadNode(Node); - return; - } + Fence = CurDAG->getMachineNode(WebAssembly::COMPILER_FENCE, + DL, // debug loc + MVT::Other, // outchain type + Node->getOperand(0) // inchain + ); + break; + case SyncScope::System: + // Currently wasm only supports sequentially consistent atomics, so we + // always set the order to 0 (sequentially consistent). + Fence = CurDAG->getMachineNode( + WebAssembly::ATOMIC_FENCE, + DL, // debug loc + MVT::Other, // outchain type + CurDAG->getTargetConstant(0, DL, MVT::i32), // order + Node->getOperand(0) // inchain + ); + break; default: llvm_unreachable("Unknown scope!"); } + + ReplaceNode(Node, Fence); + CurDAG->RemoveDeadNode(Node); + return; } case ISD::GlobalTLSAddress: { @@ -224,6 +178,33 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, TLSSize); return; } + case Intrinsic::wasm_tls_align: { + MVT PtrVT = TLI->getPointerTy(CurDAG->getDataLayout()); + assert(PtrVT == MVT::i32 && "only wasm32 is supported for now"); + + MachineSDNode *TLSAlign = CurDAG->getMachineNode( + WebAssembly::GLOBAL_GET_I32, DL, PtrVT, + CurDAG->getTargetExternalSymbol("__tls_align", MVT::i32)); + ReplaceNode(Node, TLSAlign); + return; + } + } + break; + } + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); + switch (IntNo) { + case Intrinsic::wasm_tls_base: { + MVT PtrVT = TLI->getPointerTy(CurDAG->getDataLayout()); + assert(PtrVT == MVT::i32 && "only wasm32 is supported for now"); + + MachineSDNode *TLSBase = CurDAG->getMachineNode( + WebAssembly::GLOBAL_GET_I32, DL, MVT::i32, MVT::Other, + CurDAG->getTargetExternalSymbol("__tls_base", PtrVT), + Node->getOperand(0)); + ReplaceNode(Node, TLSBase); + return; + } } break; } diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 4064a983099c..f06afdbcea9e 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -205,7 +205,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( for (auto T : {MVT::i8, MVT::i16, MVT::i32}) setOperationAction(ISD::SIGN_EXTEND_INREG, T, Action); } - for (auto T : MVT::integer_vector_valuetypes()) + for (auto T : MVT::integer_fixedlen_vector_valuetypes()) setOperationAction(ISD::SIGN_EXTEND_INREG, T, Expand); // Dynamic stack allocation: use the default expansion. @@ -228,7 +228,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // - Floating-point extending loads. // - Floating-point truncating stores. // - i1 extending loads. 
- // - extending/truncating SIMD loads/stores + // - truncating SIMD stores and most extending loads setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); for (auto T : MVT::integer_valuetypes()) @@ -237,7 +237,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( if (Subtarget->hasSIMD128()) { for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v4f32, MVT::v2f64}) { - for (auto MemT : MVT::vector_valuetypes()) { + for (auto MemT : MVT::fixedlen_vector_valuetypes()) { if (MVT(T) != MemT) { setTruncStoreAction(T, MemT, Expand); for (auto Ext : {ISD::EXTLOAD, ISD::ZEXTLOAD, ISD::SEXTLOAD}) @@ -245,6 +245,14 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( } } } + // But some vector extending loads are legal + if (Subtarget->hasUnimplementedSIMD128()) { + for (auto Ext : {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}) { + setLoadExtAction(Ext, MVT::v8i16, MVT::v8i8, Legal); + setLoadExtAction(Ext, MVT::v4i32, MVT::v4i16, Legal); + setLoadExtAction(Ext, MVT::v2i64, MVT::v2i32, Legal); + } + } } // Don't do anything clever with build_pairs @@ -259,16 +267,6 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setMaxAtomicSizeInBitsSupported(64); - if (Subtarget->hasBulkMemory()) { - // Use memory.copy and friends over multiple loads and stores - MaxStoresPerMemcpy = 1; - MaxStoresPerMemcpyOptSize = 1; - MaxStoresPerMemmove = 1; - MaxStoresPerMemmoveOptSize = 1; - MaxStoresPerMemset = 1; - MaxStoresPerMemsetOptSize = 1; - } - // Override the __gnu_f2h_ieee/__gnu_h2f_ieee names so that the f32 name is // consistent with the f64 and f128 names. setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); @@ -337,8 +335,8 @@ static MachineBasicBlock *LowerFPToInt(MachineInstr &MI, DebugLoc DL, bool Float64, unsigned LoweredOpcode) { MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - unsigned OutReg = MI.getOperand(0).getReg(); - unsigned InReg = MI.getOperand(1).getReg(); + Register OutReg = MI.getOperand(0).getReg(); + Register InReg = MI.getOperand(1).getReg(); unsigned Abs = Float64 ? WebAssembly::ABS_F64 : WebAssembly::ABS_F32; unsigned FConst = Float64 ? WebAssembly::CONST_F64 : WebAssembly::CONST_F32; @@ -396,9 +394,9 @@ static MachineBasicBlock *LowerFPToInt(MachineInstr &MI, DebugLoc DL, // For unsigned numbers, we have to do a separate comparison with zero. 
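For readers skimming the diff, the guard this helper assembles is easier to follow in scalar form. A minimal sketch, not part of the patch; the 2^32 bound and the Substitute fallback stand in for the FConst/IConst operands that LowerFPToInt materializes:

#include <cmath>
#include <cstdint>

// Emulates the compare-and-branch diamond built around the trapping
// i32.trunc_u/f32 instruction: perform the truncation only when the input
// is provably in range, otherwise yield the substitute value.
uint32_t checkedTruncF32ToU32(float X, uint32_t Substitute) {
  // CmpReg: |X| < 2^32 rules out overflow, and is false for NaN, so NaN
  // also falls through to the substitute path.
  bool InRange = std::fabs(X) < 0x1p32f;
  // SecondCmpReg/AndReg: unsigned conversions additionally require X >= 0.
  bool NonNegative = X >= 0.0f;
  if (InRange && NonNegative)
    return static_cast<uint32_t>(X); // the trapping instruction cannot trap here
  return Substitute;
}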
if (IsUnsigned) { Tmp1 = MRI.createVirtualRegister(MRI.getRegClass(InReg)); - unsigned SecondCmpReg = + Register SecondCmpReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); - unsigned AndReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + Register AndReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); BuildMI(BB, DL, TII.get(FConst), Tmp1) .addFPImm(cast<ConstantFP>(ConstantFP::get(Ty, 0.0))); BuildMI(BB, DL, TII.get(GE), SecondCmpReg).addReg(Tmp0).addReg(Tmp1); @@ -550,6 +548,16 @@ bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT, return true; } +bool WebAssemblyTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { + if (!Subtarget->hasUnimplementedSIMD128()) + return false; + MVT ExtT = ExtVal.getSimpleValueType(); + MVT MemT = cast<LoadSDNode>(ExtVal->getOperand(0))->getSimpleValueType(0); + return (ExtT == MVT::v8i16 && MemT == MVT::v8i8) || + (ExtT == MVT::v4i32 && MemT == MVT::v4i16) || + (ExtT == MVT::v2i64 && MemT == MVT::v2i32); +} + EVT WebAssemblyTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C, EVT VT) const { @@ -569,7 +577,7 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::i32; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = 4; + Info.align = Align(4); // atomic.notify instruction does not really load the memory specified with // this argument, but MachineMemOperand should either be load or store, so // we set this to a load. @@ -583,7 +591,7 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::i32; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = 4; + Info.align = Align(4); Info.flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad; return true; case Intrinsic::wasm_atomic_wait_i64: @@ -591,7 +599,7 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::i64; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = 8; + Info.align = Align(8); Info.flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad; return true; default: @@ -623,7 +631,8 @@ static bool callingConvSupported(CallingConv::ID CallConv) { CallConv == CallingConv::Cold || CallConv == CallingConv::PreserveMost || CallConv == CallingConv::PreserveAll || - CallConv == CallingConv::CXX_FAST_TLS; + CallConv == CallingConv::CXX_FAST_TLS || + CallConv == CallingConv::WASM_EmscriptenInvoke; } SDValue @@ -644,13 +653,36 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, if (CLI.IsPatchPoint) fail(DL, DAG, "WebAssembly doesn't support patch point yet"); - // Fail if tail calls are required but not enabled - if (!Subtarget->hasTailCall()) { - if ((CallConv == CallingConv::Fast && CLI.IsTailCall && - MF.getTarget().Options.GuaranteedTailCallOpt) || - (CLI.CS && CLI.CS.isMustTailCall())) - fail(DL, DAG, "WebAssembly 'tail-call' feature not enabled"); - CLI.IsTailCall = false; + if (CLI.IsTailCall) { + bool MustTail = CLI.CS && CLI.CS.isMustTailCall(); + if (Subtarget->hasTailCall() && !CLI.IsVarArg) { + // Do not tail call unless caller and callee return types match + const Function &F = MF.getFunction(); + const TargetMachine &TM = getTargetMachine(); + Type *RetTy = F.getReturnType(); + SmallVector<MVT, 4> CallerRetTys; + SmallVector<MVT, 4> CalleeRetTys; + computeLegalValueVTs(F, TM, RetTy, CallerRetTys); + computeLegalValueVTs(F, TM, CLI.RetTy, CalleeRetTys); + bool TypesMatch = CallerRetTys.size() == CalleeRetTys.size() && + std::equal(CallerRetTys.begin(), 
CallerRetTys.end(), + CalleeRetTys.begin()); + if (!TypesMatch) { + // musttail in this case would be an LLVM IR validation failure + assert(!MustTail); + CLI.IsTailCall = false; + } + } else { + CLI.IsTailCall = false; + if (MustTail) { + if (CLI.IsVarArg) { + // The return would pop the argument buffer + fail(DL, DAG, "WebAssembly does not support varargs tail calls"); + } else { + fail(DL, DAG, "WebAssembly 'tail-call' feature not enabled"); + } + } + } } SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; @@ -659,6 +691,16 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; + + // The generic code may have added an sret argument. If we're lowering an + // invoke function, the ABI requires that the function pointer be the first + // argument, so we may have to swap the arguments. + if (CallConv == CallingConv::WASM_EmscriptenInvoke && Outs.size() >= 2 && + Outs[0].Flags.isSRet()) { + std::swap(Outs[0], Outs[1]); + std::swap(OutVals[0], OutVals[1]); + } + unsigned NumFixedArgs = 0; for (unsigned I = 0; I < Outs.size(); ++I) { const ISD::OutputArg &Out = Outs[I]; @@ -810,8 +852,8 @@ bool WebAssemblyTargetLowering::CanLowerReturn( CallingConv::ID /*CallConv*/, MachineFunction & /*MF*/, bool /*IsVarArg*/, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext & /*Context*/) const { - // WebAssembly can't currently handle returning tuples. - return Outs.size() <= 1; + // WebAssembly can only handle returning tuples with multivalue enabled + return Subtarget->hasMultivalue() || Outs.size() <= 1; } SDValue WebAssemblyTargetLowering::LowerReturn( @@ -819,7 +861,8 @@ SDValue WebAssemblyTargetLowering::LowerReturn( const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, SelectionDAG &DAG) const { - assert(Outs.size() <= 1 && "WebAssembly can only return up to one value"); + assert((Subtarget->hasMultivalue() || Outs.size() <= 1) && + "MVP WebAssembly can only return up to one value"); if (!callingConvSupported(CallConv)) fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); @@ -881,7 +924,7 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments( // the buffer is passed as an argument. 
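As background for the vararg-buffer handling here and in LowerCall, a rough sketch of the ABI contract, not from the patch (the callee/caller names are made up): the variadic portion of a call is spilled to a caller-owned buffer in linear memory, and the callee receives its address as one extra fixed argument.

#include <cstdint>
#include <cstring>

// What a call like variadic("fmt", 1, 2.0) amounts to after lowering.
int32_t callee(const char *Fmt, uint8_t *VarargBuf) {
  int32_t A;
  double B;
  std::memcpy(&A, VarargBuf + 0, sizeof A); // slot at offset 0, align 4
  std::memcpy(&B, VarargBuf + 8, sizeof B); // slot at offset 8, align 8
  return A + static_cast<int32_t>(B);
}

int32_t caller(const char *Fmt) {
  alignas(8) uint8_t Buf[16]; // the "vararg buffer" built on the shadow stack
  int32_t A = 1;
  double B = 2.0;
  std::memcpy(Buf + 0, &A, sizeof A);
  std::memcpy(Buf + 8, &B, sizeof B);
  return callee(Fmt, Buf); // buffer pointer passed as a fixed argument
}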
if (IsVarArg) { MVT PtrVT = getPointerTy(MF.getDataLayout()); - unsigned VarargVreg = + Register VarargVreg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrVT)); MFI->setVarargBufferVreg(VarargVreg); Chain = DAG.getCopyToReg( @@ -1022,8 +1065,9 @@ SDValue WebAssemblyTargetLowering::LowerRETURNADDR(SDValue Op, return SDValue(); unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + MakeLibCallOptions CallOptions; return makeLibCall(DAG, RTLIB::RETURN_ADDRESS, Op.getValueType(), - {DAG.getConstant(Depth, DL, MVT::i32)}, false, DL) + {DAG.getConstant(Depth, DL, MVT::i32)}, CallOptions, DL) .first; } @@ -1037,7 +1081,7 @@ SDValue WebAssemblyTargetLowering::LowerFRAMEADDR(SDValue Op, DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); - unsigned FP = + Register FP = Subtarget->getRegisterInfo()->getFrameRegister(DAG.getMachineFunction()); return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op), FP, VT); } @@ -1249,68 +1293,116 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op, const EVT VecT = Op.getValueType(); const EVT LaneT = Op.getOperand(0).getValueType(); const size_t Lanes = Op.getNumOperands(); + bool CanSwizzle = Subtarget->hasUnimplementedSIMD128() && VecT == MVT::v16i8; + + // BUILD_VECTORs are lowered to the instruction that initializes the highest + // possible number of lanes at once followed by a sequence of replace_lane + // instructions to individually initialize any remaining lanes. + + // TODO: Tune this. For example, lanewise swizzling is very expensive, so + // swizzled lanes should be given greater weight. + + // TODO: Investigate building vectors by shuffling together vectors built by + // separately specialized means. + auto IsConstant = [](const SDValue &V) { return V.getOpcode() == ISD::Constant || V.getOpcode() == ISD::ConstantFP; }; - // Find the most common operand, which is approximately the best to splat - using Entry = std::pair<SDValue, size_t>; - SmallVector<Entry, 16> ValueCounts; - size_t NumConst = 0, NumDynamic = 0; - for (const SDValue &Lane : Op->op_values()) { - if (Lane.isUndef()) { - continue; - } else if (IsConstant(Lane)) { - NumConst++; - } else { - NumDynamic++; - } - auto CountIt = std::find_if(ValueCounts.begin(), ValueCounts.end(), - [&Lane](Entry A) { return A.first == Lane; }); - if (CountIt == ValueCounts.end()) { - ValueCounts.emplace_back(Lane, 1); + // Returns the source vector and index vector pair if they exist. 
Checks for: + // (extract_vector_elt + // $src, + // (sign_extend_inreg (extract_vector_elt $indices, $i)) + // ) + auto GetSwizzleSrcs = [](size_t I, const SDValue &Lane) { + auto Bail = std::make_pair(SDValue(), SDValue()); + if (Lane->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return Bail; + const SDValue &SwizzleSrc = Lane->getOperand(0); + const SDValue &IndexExt = Lane->getOperand(1); + if (IndexExt->getOpcode() != ISD::SIGN_EXTEND_INREG) + return Bail; + const SDValue &Index = IndexExt->getOperand(0); + if (Index->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return Bail; + const SDValue &SwizzleIndices = Index->getOperand(0); + if (SwizzleSrc.getValueType() != MVT::v16i8 || + SwizzleIndices.getValueType() != MVT::v16i8 || + Index->getOperand(1)->getOpcode() != ISD::Constant || + Index->getConstantOperandVal(1) != I) + return Bail; + return std::make_pair(SwizzleSrc, SwizzleIndices); + }; + + using ValueEntry = std::pair<SDValue, size_t>; + SmallVector<ValueEntry, 16> SplatValueCounts; + + using SwizzleEntry = std::pair<std::pair<SDValue, SDValue>, size_t>; + SmallVector<SwizzleEntry, 16> SwizzleCounts; + + auto AddCount = [](auto &Counts, const auto &Val) { + auto CountIt = std::find_if(Counts.begin(), Counts.end(), + [&Val](auto E) { return E.first == Val; }); + if (CountIt == Counts.end()) { + Counts.emplace_back(Val, 1); } else { CountIt->second++; } + }; + + auto GetMostCommon = [](auto &Counts) { + auto CommonIt = + std::max_element(Counts.begin(), Counts.end(), + [](auto A, auto B) { return A.second < B.second; }); + assert(CommonIt != Counts.end() && "Unexpected all-undef build_vector"); + return *CommonIt; + }; + + size_t NumConstantLanes = 0; + + // Count eligible lanes for each type of vector creation op + for (size_t I = 0; I < Lanes; ++I) { + const SDValue &Lane = Op->getOperand(I); + if (Lane.isUndef()) + continue; + + AddCount(SplatValueCounts, Lane); + + if (IsConstant(Lane)) { + NumConstantLanes++; + } else if (CanSwizzle) { + auto SwizzleSrcs = GetSwizzleSrcs(I, Lane); + if (SwizzleSrcs.first) + AddCount(SwizzleCounts, SwizzleSrcs); + } } - auto CommonIt = - std::max_element(ValueCounts.begin(), ValueCounts.end(), - [](Entry A, Entry B) { return A.second < B.second; }); - assert(CommonIt != ValueCounts.end() && "Unexpected all-undef build_vector"); - SDValue SplatValue = CommonIt->first; - size_t NumCommon = CommonIt->second; - // If v128.const is available, consider using it instead of a splat + SDValue SplatValue; + size_t NumSplatLanes; + std::tie(SplatValue, NumSplatLanes) = GetMostCommon(SplatValueCounts); + + SDValue SwizzleSrc; + SDValue SwizzleIndices; + size_t NumSwizzleLanes = 0; + if (SwizzleCounts.size()) + std::forward_as_tuple(std::tie(SwizzleSrc, SwizzleIndices), + NumSwizzleLanes) = GetMostCommon(SwizzleCounts); + + // Predicate returning true if the lane is properly initialized by the + // original instruction + std::function<bool(size_t, const SDValue &)> IsLaneConstructed; + SDValue Result; if (Subtarget->hasUnimplementedSIMD128()) { - // {i32,i64,f32,f64}.const opcode, and value - const size_t ConstBytes = 1 + std::max(size_t(4), 16 / Lanes); - // SIMD prefix and opcode - const size_t SplatBytes = 2; - const size_t SplatConstBytes = SplatBytes + ConstBytes; - // SIMD prefix, opcode, and lane index - const size_t ReplaceBytes = 3; - const size_t ReplaceConstBytes = ReplaceBytes + ConstBytes; - // SIMD prefix, v128.const opcode, and 128-bit value - const size_t VecConstBytes = 18; - // Initial v128.const and a replace_lane for each non-const operand - 
const size_t ConstInitBytes = VecConstBytes + NumDynamic * ReplaceBytes; - // Initial splat and all necessary replace_lanes - const size_t SplatInitBytes = - IsConstant(SplatValue) - // Initial constant splat - ? (SplatConstBytes + - // Constant replace_lanes - (NumConst - NumCommon) * ReplaceConstBytes + - // Dynamic replace_lanes - (NumDynamic * ReplaceBytes)) - // Initial dynamic splat - : (SplatBytes + - // Constant replace_lanes - (NumConst * ReplaceConstBytes) + - // Dynamic replace_lanes - (NumDynamic - NumCommon) * ReplaceBytes); - if (ConstInitBytes < SplatInitBytes) { - // Create build_vector that will lower to initial v128.const + // Prefer swizzles over vector consts over splats + if (NumSwizzleLanes >= NumSplatLanes && + NumSwizzleLanes >= NumConstantLanes) { + Result = DAG.getNode(WebAssemblyISD::SWIZZLE, DL, VecT, SwizzleSrc, + SwizzleIndices); + auto Swizzled = std::make_pair(SwizzleSrc, SwizzleIndices); + IsLaneConstructed = [&, Swizzled](size_t I, const SDValue &Lane) { + return Swizzled == GetSwizzleSrcs(I, Lane); + }; + } else if (NumConstantLanes >= NumSplatLanes) { SmallVector<SDValue, 16> ConstLanes; for (const SDValue &Lane : Op->op_values()) { if (IsConstant(Lane)) { @@ -1321,26 +1413,35 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op, ConstLanes.push_back(DAG.getConstant(0, DL, LaneT)); } } - SDValue Result = DAG.getBuildVector(VecT, DL, ConstLanes); - // Add replace_lane instructions for non-const lanes - for (size_t I = 0; I < Lanes; ++I) { - const SDValue &Lane = Op->getOperand(I); - if (!Lane.isUndef() && !IsConstant(Lane)) - Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecT, Result, Lane, - DAG.getConstant(I, DL, MVT::i32)); - } - return Result; + Result = DAG.getBuildVector(VecT, DL, ConstLanes); + IsLaneConstructed = [&](size_t _, const SDValue &Lane) { + return IsConstant(Lane); + }; + } + } + if (!Result) { + // Use a splat, but possibly a load_splat + LoadSDNode *SplattedLoad; + if (Subtarget->hasUnimplementedSIMD128() && + (SplattedLoad = dyn_cast<LoadSDNode>(SplatValue)) && + SplattedLoad->getMemoryVT() == VecT.getVectorElementType()) { + Result = DAG.getNode(WebAssemblyISD::LOAD_SPLAT, DL, VecT, SplatValue); + } else { + Result = DAG.getSplatBuildVector(VecT, DL, SplatValue); } + IsLaneConstructed = [&](size_t _, const SDValue &Lane) { + return Lane == SplatValue; + }; } - // Use a splat for the initial vector - SDValue Result = DAG.getSplatBuildVector(VecT, DL, SplatValue); - // Add replace_lane instructions for other values + + // Add replace_lane instructions for any unhandled values for (size_t I = 0; I < Lanes; ++I) { const SDValue &Lane = Op->getOperand(I); - if (Lane != SplatValue) + if (!Lane.isUndef() && !IsLaneConstructed(I, Lane)) Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecT, Result, Lane, DAG.getConstant(I, DL, MVT::i32)); } + return Result; } @@ -1415,11 +1516,6 @@ SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op, // Only manually lower vector shifts assert(Op.getSimpleValueType().isVector()); - // Expand all vector shifts until V8 fixes its implementation - // TODO: remove this once V8 is fixed - if (!Subtarget->hasUnimplementedSIMD128()) - return unrollVectorShift(Op, DAG); - // Unroll non-splat vector shifts BuildVectorSDNode *ShiftVec; SDValue SplatVal; diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/lib/Target/WebAssembly/WebAssemblyISelLowering.h index b3c7f3defd5f..a53e24a05542 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ 
b/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -63,7 +63,7 @@ private: MachineMemOperand::Flags Flags, bool *Fast) const override; bool isIntDivCheap(EVT VT, AttributeList Attr) const override; - + bool isVectorLoadExtDesirable(SDValue ExtVal) const override; EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, diff --git a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td index e85aa57efc42..a9a99d38f9f1 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td @@ -71,12 +71,6 @@ class NotifyPatImmOff<PatFrag operand> : def : NotifyPatImmOff<regPlusImm>; def : NotifyPatImmOff<or_is_add>; -def NotifyPatGlobalAddr : - Pat<(i32 (int_wasm_atomic_notify (regPlusGA I32:$addr, - (WebAssemblywrapper tglobaladdr:$off)), - I32:$count)), - (ATOMIC_NOTIFY 0, tglobaladdr:$off, I32:$addr, I32:$count)>; - // Select notifys with just a constant offset. def NotifyPatOffsetOnly : Pat<(i32 (int_wasm_atomic_notify imm:$off, I32:$count)), @@ -105,13 +99,6 @@ def : WaitPatImmOff<i32, int_wasm_atomic_wait_i32, or_is_add, ATOMIC_WAIT_I32>; def : WaitPatImmOff<i64, int_wasm_atomic_wait_i64, regPlusImm, ATOMIC_WAIT_I64>; def : WaitPatImmOff<i64, int_wasm_atomic_wait_i64, or_is_add, ATOMIC_WAIT_I64>; -class WaitPatGlobalAddr<ValueType ty, Intrinsic kind, NI inst> : - Pat<(i32 (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)), - ty:$exp, I64:$timeout)), - (inst 0, tglobaladdr:$off, I32:$addr, ty:$exp, I64:$timeout)>; -def : WaitPatGlobalAddr<i32, int_wasm_atomic_wait_i32, ATOMIC_WAIT_I32>; -def : WaitPatGlobalAddr<i64, int_wasm_atomic_wait_i64, ATOMIC_WAIT_I64>; - // Select wait_i32, ATOMIC_WAIT_I32s with just a constant offset. class WaitPatOffsetOnly<ValueType ty, Intrinsic kind, NI inst> : Pat<(i32 (kind imm:$off, ty:$exp, I64:$timeout)), @@ -127,6 +114,19 @@ def : WaitPatGlobalAddrOffOnly<i64, int_wasm_atomic_wait_i64, ATOMIC_WAIT_I64>; } // Predicates = [HasAtomics] //===----------------------------------------------------------------------===// +// Atomic fences +//===----------------------------------------------------------------------===// + +// A compiler fence instruction that prevents reordering of instructions. +let Defs = [ARGUMENTS] in { +let isPseudo = 1, hasSideEffects = 1 in +defm COMPILER_FENCE : ATOMIC_NRI<(outs), (ins), [], "compiler_fence">; +let hasSideEffects = 1 in +defm ATOMIC_FENCE : ATOMIC_NRI<(outs), (ins i8imm:$flags), [], "atomic.fence", + 0x03>; +} // Defs = [ARGUMENTS] + +//===----------------------------------------------------------------------===// // Atomic loads //===----------------------------------------------------------------------===// @@ -151,9 +151,6 @@ def : LoadPatImmOff<i64, atomic_load_64, regPlusImm, ATOMIC_LOAD_I64>; def : LoadPatImmOff<i32, atomic_load_32, or_is_add, ATOMIC_LOAD_I32>; def : LoadPatImmOff<i64, atomic_load_64, or_is_add, ATOMIC_LOAD_I64>; -def : LoadPatGlobalAddr<i32, atomic_load_32, ATOMIC_LOAD_I32>; -def : LoadPatGlobalAddr<i64, atomic_load_64, ATOMIC_LOAD_I64>; - // Select loads with just a constant offset. 
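An IRBuilder-level sketch of the two fence flavours the ATOMIC_FENCE/COMPILER_FENCE definitions above back, not from the patch: a process-wide seq_cst fence now selects the real atomic.fence (opcode 0x03 under the atomics prefix, with a flags byte that must currently be zero), while a singlethread fence selects the COMPILER_FENCE pseudo, which constrains codegen but emits nothing.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"

using namespace llvm;

void emitBothFences(Module &M) {
  LLVMContext &C = M.getContext();
  auto *FTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg=*/false);
  auto *F = Function::Create(FTy, Function::ExternalLinkage, "fences", &M);
  IRBuilder<> B(BasicBlock::Create(C, "entry", F));

  // Selected as ATOMIC_FENCE: a real instruction in the final binary.
  B.CreateFence(AtomicOrdering::SequentiallyConsistent);

  // Selected as COMPILER_FENCE: blocks reordering during codegen only.
  B.CreateFence(AtomicOrdering::SequentiallyConsistent,
                C.getOrInsertSyncScopeID("singlethread"));

  B.CreateRetVoid();
}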
def : LoadPatOffsetOnly<i32, atomic_load_32, ATOMIC_LOAD_I32>; def : LoadPatOffsetOnly<i64, atomic_load_64, ATOMIC_LOAD_I64>; @@ -244,16 +241,6 @@ def : LoadPatImmOff<i64, sext_aload_8_64, or_is_add, ATOMIC_LOAD8_U_I64>; def : LoadPatImmOff<i64, sext_aload_16_64, or_is_add, ATOMIC_LOAD16_U_I64>; // No 32->64 patterns, just use i32.atomic.load and i64.extend_s/i64 -def : LoadPatGlobalAddr<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>; -def : LoadPatGlobalAddr<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>; -def : LoadPatGlobalAddr<i64, zext_aload_8_64, ATOMIC_LOAD8_U_I64>; -def : LoadPatGlobalAddr<i64, zext_aload_16_64, ATOMIC_LOAD16_U_I64>; -def : LoadPatGlobalAddr<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>; -def : LoadPatGlobalAddr<i32, atomic_load_8, ATOMIC_LOAD8_U_I32>; -def : LoadPatGlobalAddr<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>; -def : LoadPatGlobalAddr<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>; -def : LoadPatGlobalAddr<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>; - // Extending loads with just a constant offset def : LoadPatOffsetOnly<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>; def : LoadPatOffsetOnly<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>; @@ -313,13 +300,6 @@ def : AStorePatImmOff<i64, atomic_store_64, regPlusImm, ATOMIC_STORE_I64>; def : AStorePatImmOff<i32, atomic_store_32, or_is_add, ATOMIC_STORE_I32>; def : AStorePatImmOff<i64, atomic_store_64, or_is_add, ATOMIC_STORE_I64>; -class AStorePatGlobalAddr<ValueType ty, PatFrag kind, NI inst> : - Pat<(kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)), - ty:$val), - (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>; -def : AStorePatGlobalAddr<i32, atomic_store_32, ATOMIC_STORE_I32>; -def : AStorePatGlobalAddr<i64, atomic_store_64, ATOMIC_STORE_I64>; - // Select stores with just a constant offset. class AStorePatOffsetOnly<ValueType ty, PatFrag kind, NI inst> : Pat<(kind imm:$off, ty:$val), (inst 0, imm:$off, (CONST_I32 0), ty:$val)>; @@ -374,12 +354,6 @@ def : AStorePatImmOff<i64, trunc_astore_8_64, or_is_add, ATOMIC_STORE8_I64>; def : AStorePatImmOff<i64, trunc_astore_16_64, or_is_add, ATOMIC_STORE16_I64>; def : AStorePatImmOff<i64, trunc_astore_32_64, or_is_add, ATOMIC_STORE32_I64>; -def : AStorePatGlobalAddr<i32, atomic_store_8, ATOMIC_STORE8_I32>; -def : AStorePatGlobalAddr<i32, atomic_store_16, ATOMIC_STORE16_I32>; -def : AStorePatGlobalAddr<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>; -def : AStorePatGlobalAddr<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>; -def : AStorePatGlobalAddr<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>; - // Truncating stores with just a constant offset def : AStorePatOffsetOnly<i32, atomic_store_8, ATOMIC_STORE8_I32>; def : AStorePatOffsetOnly<i32, atomic_store_16, ATOMIC_STORE16_I32>; @@ -500,11 +474,6 @@ class BinRMWPatImmOff<ValueType ty, PatFrag kind, PatFrag operand, NI inst> : Pat<(ty (kind (operand I32:$addr, imm:$off), ty:$val)), (inst 0, imm:$off, I32:$addr, ty:$val)>; -class BinRMWPatGlobalAddr<ValueType ty, PatFrag kind, NI inst> : - Pat<(ty (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)), - ty:$val)), - (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>; - // Select binary RMWs with just a constant offset. 
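For orientation, the binary-RMW pattern classes map one-to-one onto C++ atomic read-modify-write operations; a small sketch, not from the patch, with the expected wasm mnemonics in comments:

#include <atomic>
#include <cstdint>

// Every wasm atomic access is sequentially consistent; there are no
// weaker orderings to select between.
uint32_t rmw32(std::atomic<uint32_t> &X) {
  uint32_t Old = X.fetch_add(1); // i32.atomic.rmw.add
  Old += X.fetch_and(0xffu);     // i32.atomic.rmw.and
  Old += X.exchange(42);         // i32.atomic.rmw.xchg
  return Old;
}

uint64_t rmw64(std::atomic<uint64_t> &X) {
  return X.fetch_or(1); // i64.atomic.rmw.or
}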
class BinRMWPatOffsetOnly<ValueType ty, PatFrag kind, NI inst> : Pat<(ty (kind imm:$off, ty:$val)), @@ -525,9 +494,6 @@ multiclass BinRMWPattern<PatFrag rmw_32, PatFrag rmw_64, NI inst_32, def : BinRMWPatImmOff<i32, rmw_32, or_is_add, inst_32>; def : BinRMWPatImmOff<i64, rmw_64, or_is_add, inst_64>; - def : BinRMWPatGlobalAddr<i32, rmw_32, inst_32>; - def : BinRMWPatGlobalAddr<i64, rmw_64, inst_64>; - def : BinRMWPatOffsetOnly<i32, rmw_32, inst_32>; def : BinRMWPatOffsetOnly<i64, rmw_64, inst_64>; @@ -622,17 +588,6 @@ multiclass BinRMWTruncExtPattern< def : BinRMWPatImmOff<i64, sext_bin_rmw_8_64<rmw_8>, or_is_add, inst8_64>; def : BinRMWPatImmOff<i64, sext_bin_rmw_16_64<rmw_16>, or_is_add, inst16_64>; - def : BinRMWPatGlobalAddr<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>; - def : BinRMWPatGlobalAddr<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>; - def : BinRMWPatGlobalAddr<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>; - def : BinRMWPatGlobalAddr<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>; - def : BinRMWPatGlobalAddr<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>; - - def : BinRMWPatGlobalAddr<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>; - def : BinRMWPatGlobalAddr<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>; - def : BinRMWPatGlobalAddr<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>; - def : BinRMWPatGlobalAddr<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>; - // Truncating-extending binary RMWs with just a constant offset def : BinRMWPatOffsetOnly<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>; def : BinRMWPatOffsetOnly<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>; @@ -732,11 +687,6 @@ class TerRMWPatImmOff<ValueType ty, PatFrag kind, PatFrag operand, NI inst> : Pat<(ty (kind (operand I32:$addr, imm:$off), ty:$exp, ty:$new)), (inst 0, imm:$off, I32:$addr, ty:$exp, ty:$new)>; -class TerRMWPatGlobalAddr<ValueType ty, PatFrag kind, NI inst> : - Pat<(ty (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)), - ty:$exp, ty:$new)), - (inst 0, tglobaladdr:$off, I32:$addr, ty:$exp, ty:$new)>; - // Select ternary RMWs with just a constant offset. 
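The truncating/extending and ternary variants follow the same shape; a rough user-level mapping, not from the patch:

#include <atomic>
#include <cstdint>

uint8_t subword(std::atomic<uint8_t> &X) {
  // Sub-word RMW instructions exist only in zero-extending (_u) form.
  return X.fetch_add(1); // i32.atomic.rmw8.add_u
}

bool ternary(std::atomic<uint32_t> &X) {
  // "Ternary RMW" is compare-exchange: it takes both an expected and a
  // replacement operand, hence the separate TerRMW pattern classes.
  uint32_t Expected = 0;
  return X.compare_exchange_strong(Expected, 1); // i32.atomic.rmw.cmpxchg
}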
class TerRMWPatOffsetOnly<ValueType ty, PatFrag kind, NI inst> : Pat<(ty (kind imm:$off, ty:$exp, ty:$new)), @@ -757,9 +707,6 @@ multiclass TerRMWPattern<PatFrag rmw_32, PatFrag rmw_64, NI inst_32, def : TerRMWPatImmOff<i32, rmw_32, or_is_add, inst_32>; def : TerRMWPatImmOff<i64, rmw_64, or_is_add, inst_64>; - def : TerRMWPatGlobalAddr<i32, rmw_32, inst_32>; - def : TerRMWPatGlobalAddr<i64, rmw_64, inst_64>; - def : TerRMWPatOffsetOnly<i32, rmw_32, inst_32>; def : TerRMWPatOffsetOnly<i64, rmw_64, inst_64>; @@ -846,17 +793,6 @@ multiclass TerRMWTruncExtPattern< def : TerRMWPatImmOff<i64, sext_ter_rmw_8_64<rmw_8>, or_is_add, inst8_64>; def : TerRMWPatImmOff<i64, sext_ter_rmw_16_64<rmw_16>, or_is_add, inst16_64>; - def : TerRMWPatGlobalAddr<i32, zext_ter_rmw_8_32<rmw_8>, inst8_32>; - def : TerRMWPatGlobalAddr<i32, zext_ter_rmw_16_32<rmw_16>, inst16_32>; - def : TerRMWPatGlobalAddr<i64, zext_ter_rmw_8_64<rmw_8>, inst8_64>; - def : TerRMWPatGlobalAddr<i64, zext_ter_rmw_16_64<rmw_16>, inst16_64>; - def : TerRMWPatGlobalAddr<i64, zext_ter_rmw_32_64<rmw_32>, inst32_64>; - - def : TerRMWPatGlobalAddr<i32, sext_ter_rmw_8_32<rmw_8>, inst8_32>; - def : TerRMWPatGlobalAddr<i32, sext_ter_rmw_16_32<rmw_16>, inst16_32>; - def : TerRMWPatGlobalAddr<i64, sext_ter_rmw_8_64<rmw_8>, inst8_64>; - def : TerRMWPatGlobalAddr<i64, sext_ter_rmw_16_64<rmw_16>, inst16_64>; - // Truncating-extending ternary RMWs with just a constant offset def : TerRMWPatOffsetOnly<i32, zext_ter_rmw_8_32<rmw_8>, inst8_32>; def : TerRMWPatOffsetOnly<i32, zext_ter_rmw_16_32<rmw_16>, inst16_32>; @@ -887,13 +823,3 @@ defm : TerRMWTruncExtPattern< ATOMIC_RMW8_U_CMPXCHG_I32, ATOMIC_RMW16_U_CMPXCHG_I32, ATOMIC_RMW8_U_CMPXCHG_I64, ATOMIC_RMW16_U_CMPXCHG_I64, ATOMIC_RMW32_U_CMPXCHG_I64>; - -//===----------------------------------------------------------------------===// -// Atomic fences -//===----------------------------------------------------------------------===// - -// A compiler fence instruction that prevents reordering of instructions. 
-let Defs = [ARGUMENTS] in { -let isPseudo = 1, hasSideEffects = 1 in -defm COMPILER_FENCE : ATOMIC_NRI<(outs), (ins), [], "compiler_fence">; -} // Defs = [ARGUMENTS] diff --git a/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td index f4352e3d12ec..05735cf6d31f 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td @@ -39,7 +39,7 @@ defm MEMORY_INIT : (ins i32imm_op:$seg, i32imm_op:$idx, I32:$dest, I32:$offset, I32:$size), (outs), (ins i32imm_op:$seg, i32imm_op:$idx), - [(int_wasm_memory_init (i32 imm:$seg), (i32 imm:$idx), I32:$dest, + [(int_wasm_memory_init (i32 timm:$seg), (i32 timm:$idx), I32:$dest, I32:$offset, I32:$size )], "memory.init\t$seg, $idx, $dest, $offset, $size", @@ -48,7 +48,7 @@ defm MEMORY_INIT : let hasSideEffects = 1 in defm DATA_DROP : BULK_I<(outs), (ins i32imm_op:$seg), (outs), (ins i32imm_op:$seg), - [(int_wasm_data_drop (i32 imm:$seg))], + [(int_wasm_data_drop (i32 timm:$seg))], "data.drop\t$seg", "data.drop\t$seg", 0x09>; let mayLoad = 1, mayStore = 1 in diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td index 1870c5bc34b0..1afc9a8790dc 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -84,49 +84,19 @@ let isTerminator = 1, isBarrier = 1 in defm END_FUNCTION : NRI<(outs), (ins), [], "end_function", 0x0b>; } // Uses = [VALUE_STACK], Defs = [VALUE_STACK] -multiclass RETURN<WebAssemblyRegClass vt> { - defm RETURN_#vt : I<(outs), (ins vt:$val), (outs), (ins), - [(WebAssemblyreturn vt:$val)], - "return \t$val", "return", 0x0f>; - // Equivalent to RETURN_#vt, for use at the end of a function when wasm - // semantics return by falling off the end of the block. - let isCodeGenOnly = 1 in - defm FALLTHROUGH_RETURN_#vt : I<(outs), (ins vt:$val), (outs), (ins), []>; -} - -multiclass SIMD_RETURN<ValueType vt> { - defm RETURN_#vt : I<(outs), (ins V128:$val), (outs), (ins), - [(WebAssemblyreturn (vt V128:$val))], - "return \t$val", "return", 0x0f>, - Requires<[HasSIMD128]>; - // Equivalent to RETURN_#vt, for use at the end of a function when wasm - // semantics return by falling off the end of the block. - let isCodeGenOnly = 1 in - defm FALLTHROUGH_RETURN_#vt : I<(outs), (ins V128:$val), (outs), (ins), - []>, - Requires<[HasSIMD128]>; -} let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { let isReturn = 1 in { - defm "": RETURN<I32>; - defm "": RETURN<I64>; - defm "": RETURN<F32>; - defm "": RETURN<F64>; - defm "": RETURN<EXNREF>; - defm "": SIMD_RETURN<v16i8>; - defm "": SIMD_RETURN<v8i16>; - defm "": SIMD_RETURN<v4i32>; - defm "": SIMD_RETURN<v2i64>; - defm "": SIMD_RETURN<v4f32>; - defm "": SIMD_RETURN<v2f64>; - defm RETURN_VOID : NRI<(outs), (ins), [(WebAssemblyreturn)], "return", 0x0f>; +defm RETURN : I<(outs), (ins variable_ops), (outs), (ins), + [(WebAssemblyreturn)], + "return", "return", 0x0f>; +// Equivalent to RETURN, for use at the end of a function when wasm +// semantics return by falling off the end of the block. +let isCodeGenOnly = 1 in +defm FALLTHROUGH_RETURN : I<(outs), (ins variable_ops), (outs), (ins), []>; - // This is to RETURN_VOID what FALLTHROUGH_RETURN_#vt is to RETURN_#vt. 
- let isCodeGenOnly = 1 in - defm FALLTHROUGH_RETURN_VOID : NRI<(outs), (ins), []>; } // isReturn = 1 defm UNREACHABLE : NRI<(outs), (ins), [(trap)], "unreachable", 0x00>; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td index 661fee2715ba..f3d9c5d5032c 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrConv.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td @@ -171,6 +171,23 @@ defm I64_TRUNC_U_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins), 0xb1>; } // hasSideEffects = 1 +def : Pat<(int_wasm_trunc_signed F32:$src), + (I32_TRUNC_S_F32 F32:$src)>; +def : Pat<(int_wasm_trunc_unsigned F32:$src), + (I32_TRUNC_U_F32 F32:$src)>; +def : Pat<(int_wasm_trunc_signed F64:$src), + (I32_TRUNC_S_F64 F64:$src)>; +def : Pat<(int_wasm_trunc_unsigned F64:$src), + (I32_TRUNC_U_F64 F64:$src)>; +def : Pat<(int_wasm_trunc_signed F32:$src), + (I64_TRUNC_S_F32 F32:$src)>; +def : Pat<(int_wasm_trunc_unsigned F32:$src), + (I64_TRUNC_U_F32 F32:$src)>; +def : Pat<(int_wasm_trunc_signed F64:$src), + (I64_TRUNC_S_F64 F64:$src)>; +def : Pat<(int_wasm_trunc_unsigned F64:$src), + (I64_TRUNC_U_F64 F64:$src)>; + defm F32_CONVERT_S_I32 : I<(outs F32:$dst), (ins I32:$src), (outs), (ins), [(set F32:$dst, (sint_to_fp I32:$src))], "f32.convert_i32_s\t$dst, $src", "f32.convert_i32_s", diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index a86c9af28f0d..8e8126c90e72 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -38,7 +38,7 @@ WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI) RI(STI.getTargetTriple()) {} bool WebAssemblyInstrInfo::isReallyTriviallyReMaterializable( - const MachineInstr &MI, AliasAnalysis *AA) const { + const MachineInstr &MI, AAResults *AA) const { switch (MI.getOpcode()) { case WebAssembly::CONST_I32: case WebAssembly::CONST_I64: @@ -60,7 +60,7 @@ void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB, // exist. However we need to handle both here. auto &MRI = MBB.getParent()->getRegInfo(); const TargetRegisterClass *RC = - TargetRegisterInfo::isVirtualRegister(DestReg) + Register::isVirtualRegister(DestReg) ? 
MRI.getRegClass(DestReg) : MRI.getTargetRegisterInfo()->getMinimalPhysRegClass(DestReg); diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h index df1051b4f42c..fe6211663c31 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h @@ -43,7 +43,7 @@ public: const WebAssemblyRegisterInfo &getRegisterInfo() const { return RI; } bool isReallyTriviallyReMaterializable(const MachineInstr &MI, - AliasAnalysis *AA) const override; + AAResults *AA) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index 73ddbe85d551..044901481381 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -106,7 +106,8 @@ def WebAssemblybr_table : SDNode<"WebAssemblyISD::BR_TABLE", def WebAssemblyargument : SDNode<"WebAssemblyISD::ARGUMENT", SDT_WebAssemblyArgument>; def WebAssemblyreturn : SDNode<"WebAssemblyISD::RETURN", - SDT_WebAssemblyReturn, [SDNPHasChain]>; + SDT_WebAssemblyReturn, + [SDNPHasChain, SDNPVariadic]>; def WebAssemblywrapper : SDNode<"WebAssemblyISD::Wrapper", SDT_WebAssemblyWrapper>; def WebAssemblywrapperPIC : SDNode<"WebAssemblyISD::WrapperPIC", diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td index 6916b165f970..eba9b80d3286 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td @@ -37,16 +37,6 @@ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{ return (~Known0.Zero & ~Known1.Zero) == 0; }]>; -// GlobalAddresses are conceptually unsigned values, so we can also fold them -// into immediate values as long as the add is 'nuw'. -// TODO: We'd like to also match GA offsets but there are cases where the -// register can have a negative value. Find out what more we can do. -def regPlusGA : PatFrag<(ops node:$addr, node:$off), - (add node:$addr, node:$off), - [{ - return N->getFlags().hasNoUnsignedWrap(); -}]>; - // We don't need a regPlusES because external symbols never have constant // offsets folded into them, so we can just use add. @@ -93,15 +83,6 @@ def : LoadPatImmOff<i64, load, or_is_add, LOAD_I64>; def : LoadPatImmOff<f32, load, or_is_add, LOAD_F32>; def : LoadPatImmOff<f64, load, or_is_add, LOAD_F64>; -class LoadPatGlobalAddr<ValueType ty, PatFrag kind, NI inst> : - Pat<(ty (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)))), - (inst 0, tglobaladdr:$off, I32:$addr)>, Requires<[IsNotPIC]>; - -def : LoadPatGlobalAddr<i32, load, LOAD_I32>; -def : LoadPatGlobalAddr<i64, load, LOAD_I64>; -def : LoadPatGlobalAddr<f32, load, LOAD_F32>; -def : LoadPatGlobalAddr<f64, load, LOAD_F64>; - // Select loads with just a constant offset. 
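A concrete illustration of the or_is_add fact used throughout these patterns, not from the patch: when known-bits analysis proves the two operands share no set bits, the or cannot carry, so it equals add and the constant can still be folded into the load's unsigned offset field.

#include <cassert>
#include <cstdint>

int main() {
  // LLVM often emits (or base, k) instead of (add base, k) when alignment
  // makes the low bits of base known zero; the or cannot carry then.
  uint32_t Aligned = 0x1000;              // low four bits known zero
  assert((Aligned | 12) == Aligned + 12); // safe to fold as an add

  // With overlapping bits a carry appears and the fold would be wrong,
  // which is exactly what or_is_add's known-bits check rejects.
  uint32_t Overlapping = 0x1006;
  assert((Overlapping | 12) != Overlapping + 12); // 0x100e vs 0x1012
  return 0;
}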
class LoadPatOffsetOnly<ValueType ty, PatFrag kind, NI inst> : Pat<(ty (kind imm:$off)), (inst 0, imm:$off, (CONST_I32 0))>; @@ -167,18 +148,6 @@ def : LoadPatImmOff<i64, zextloadi16, or_is_add, LOAD16_U_I64>; def : LoadPatImmOff<i64, sextloadi32, or_is_add, LOAD32_S_I64>; def : LoadPatImmOff<i64, zextloadi32, or_is_add, LOAD32_U_I64>; -def : LoadPatGlobalAddr<i32, sextloadi8, LOAD8_S_I32>; -def : LoadPatGlobalAddr<i32, zextloadi8, LOAD8_U_I32>; -def : LoadPatGlobalAddr<i32, sextloadi16, LOAD16_S_I32>; -def : LoadPatGlobalAddr<i32, zextloadi8, LOAD16_U_I32>; - -def : LoadPatGlobalAddr<i64, sextloadi8, LOAD8_S_I64>; -def : LoadPatGlobalAddr<i64, zextloadi8, LOAD8_U_I64>; -def : LoadPatGlobalAddr<i64, sextloadi16, LOAD16_S_I64>; -def : LoadPatGlobalAddr<i64, zextloadi16, LOAD16_U_I64>; -def : LoadPatGlobalAddr<i64, sextloadi32, LOAD32_S_I64>; -def : LoadPatGlobalAddr<i64, zextloadi32, LOAD32_U_I64>; - // Select extending loads with just a constant offset. def : LoadPatOffsetOnly<i32, sextloadi8, LOAD8_S_I32>; def : LoadPatOffsetOnly<i32, zextloadi8, LOAD8_U_I32>; @@ -224,11 +193,6 @@ def : LoadPatImmOff<i32, extloadi16, or_is_add, LOAD16_U_I32>; def : LoadPatImmOff<i64, extloadi8, or_is_add, LOAD8_U_I64>; def : LoadPatImmOff<i64, extloadi16, or_is_add, LOAD16_U_I64>; def : LoadPatImmOff<i64, extloadi32, or_is_add, LOAD32_U_I64>; -def : LoadPatGlobalAddr<i32, extloadi8, LOAD8_U_I32>; -def : LoadPatGlobalAddr<i32, extloadi16, LOAD16_U_I32>; -def : LoadPatGlobalAddr<i64, extloadi8, LOAD8_U_I64>; -def : LoadPatGlobalAddr<i64, extloadi16, LOAD16_U_I64>; -def : LoadPatGlobalAddr<i64, extloadi32, LOAD32_U_I64>; // Select "don't care" extending loads with just a constant offset. def : LoadPatOffsetOnly<i32, extloadi8, LOAD8_U_I32>; @@ -282,15 +246,6 @@ def : StorePatImmOff<i64, store, or_is_add, STORE_I64>; def : StorePatImmOff<f32, store, or_is_add, STORE_F32>; def : StorePatImmOff<f64, store, or_is_add, STORE_F64>; -class StorePatGlobalAddr<ValueType ty, PatFrag kind, NI inst> : - Pat<(kind ty:$val, - (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off))), - (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>, Requires<[IsNotPIC]>; -def : StorePatGlobalAddr<i32, store, STORE_I32>; -def : StorePatGlobalAddr<i64, store, STORE_I64>; -def : StorePatGlobalAddr<f32, store, STORE_F32>; -def : StorePatGlobalAddr<f64, store, STORE_F64>; - // Select stores with just a constant offset. class StorePatOffsetOnly<ValueType ty, PatFrag kind, NI inst> : Pat<(kind ty:$val, imm:$off), (inst 0, imm:$off, (CONST_I32 0), ty:$val)>; @@ -333,12 +288,6 @@ def : StorePatImmOff<i64, truncstorei8, or_is_add, STORE8_I64>; def : StorePatImmOff<i64, truncstorei16, or_is_add, STORE16_I64>; def : StorePatImmOff<i64, truncstorei32, or_is_add, STORE32_I64>; -def : StorePatGlobalAddr<i32, truncstorei8, STORE8_I32>; -def : StorePatGlobalAddr<i32, truncstorei16, STORE16_I32>; -def : StorePatGlobalAddr<i64, truncstorei8, STORE8_I64>; -def : StorePatGlobalAddr<i64, truncstorei16, STORE16_I64>; -def : StorePatGlobalAddr<i64, truncstorei32, STORE32_I64>; - // Select truncating stores with just a constant offset. 
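All of these Pat classes exploit the same encoding fact (sketch below, not from the patch): wasm memory instructions carry an unsigned constant offset immediate, so constant displacements fold into the instruction instead of costing an explicit add.

#include <cstdint>

// Selected via LoadPatImmOff as a single "i32.load offset=12" with P as
// the base operand; *P alone would use LoadPatNoOffset (offset=0), and a
// load from a constant address uses LoadPatOffsetOnly with an
// "i32.const 0" base.
int32_t loadAt(const int32_t *P) { return P[3]; }

// Truncating store, selected as "i32.store8 offset=1".
void storeByte(uint8_t *P, uint8_t V) { P[1] = V; }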
def : StorePatOffsetOnly<i32, truncstorei8, STORE8_I32>; def : StorePatOffsetOnly<i32, truncstorei16, STORE16_I32>; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index dd8930f079b0..fc5d73dac52e 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -40,47 +40,124 @@ def LaneIdx#SIZE : ImmLeaf<i32, "return 0 <= Imm && Imm < "#SIZE#";">; //===----------------------------------------------------------------------===// // Load: v128.load -multiclass SIMDLoad<ValueType vec_t> { - let mayLoad = 1, UseNamedOperandTable = 1 in - defm LOAD_#vec_t : +let mayLoad = 1, UseNamedOperandTable = 1 in +defm LOAD_V128 : + SIMD_I<(outs V128:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr), + (outs), (ins P2Align:$p2align, offset32_op:$off), [], + "v128.load\t$dst, ${off}(${addr})$p2align", + "v128.load\t$off$p2align", 0>; + +// Def load and store patterns from WebAssemblyInstrMemory.td for vector types +foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { +def : LoadPatNoOffset<vec_t, load, LOAD_V128>; +def : LoadPatImmOff<vec_t, load, regPlusImm, LOAD_V128>; +def : LoadPatImmOff<vec_t, load, or_is_add, LOAD_V128>; +def : LoadPatOffsetOnly<vec_t, load, LOAD_V128>; +def : LoadPatGlobalAddrOffOnly<vec_t, load, LOAD_V128>; +} + +// vNxM.load_splat +multiclass SIMDLoadSplat<string vec, bits<32> simdop> { + let mayLoad = 1, UseNamedOperandTable = 1, + Predicates = [HasUnimplementedSIMD128] in + defm LOAD_SPLAT_#vec : SIMD_I<(outs V128:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr), (outs), (ins P2Align:$p2align, offset32_op:$off), [], - "v128.load\t$dst, ${off}(${addr})$p2align", - "v128.load\t$off$p2align", 0>; + vec#".load_splat\t$dst, ${off}(${addr})$p2align", + vec#".load_splat\t$off$p2align", simdop>; } -foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { -defm "" : SIMDLoad<vec_t>; +defm "" : SIMDLoadSplat<"v8x16", 194>; +defm "" : SIMDLoadSplat<"v16x8", 195>; +defm "" : SIMDLoadSplat<"v32x4", 196>; +defm "" : SIMDLoadSplat<"v64x2", 197>; -// Def load and store patterns from WebAssemblyInstrMemory.td for vector types -def : LoadPatNoOffset<vec_t, load, !cast<NI>("LOAD_"#vec_t)>; -def : LoadPatImmOff<vec_t, load, regPlusImm, !cast<NI>("LOAD_"#vec_t)>; -def : LoadPatImmOff<vec_t, load, or_is_add, !cast<NI>("LOAD_"#vec_t)>; -def : LoadPatGlobalAddr<vec_t, load, !cast<NI>("LOAD_"#vec_t)>; -def : LoadPatOffsetOnly<vec_t, load, !cast<NI>("LOAD_"#vec_t)>; -def : LoadPatGlobalAddrOffOnly<vec_t, load, !cast<NI>("LOAD_"#vec_t)>; +def wasm_load_splat_t : SDTypeProfile<1, 1, []>; +def wasm_load_splat : SDNode<"WebAssemblyISD::LOAD_SPLAT", wasm_load_splat_t>; + +foreach args = [["v16i8", "i32", "extloadi8"], ["v8i16", "i32", "extloadi16"], + ["v4i32", "i32", "load"], ["v2i64", "i64", "load"], + ["v4f32", "f32", "load"], ["v2f64", "f64", "load"]] in +def load_splat_#args[0] : + PatFrag<(ops node:$addr), (wasm_load_splat + (!cast<ValueType>(args[1]) (!cast<PatFrag>(args[2]) node:$addr)))>; + +let Predicates = [HasUnimplementedSIMD128] in +foreach args = [["v16i8", "v8x16"], ["v8i16", "v16x8"], ["v4i32", "v32x4"], + ["v2i64", "v64x2"], ["v4f32", "v32x4"], ["v2f64", "v64x2"]] in { +def : LoadPatNoOffset<!cast<ValueType>(args[0]), + !cast<PatFrag>("load_splat_"#args[0]), + !cast<NI>("LOAD_SPLAT_"#args[1])>; +def : LoadPatImmOff<!cast<ValueType>(args[0]), + !cast<PatFrag>("load_splat_"#args[0]), + regPlusImm, + !cast<NI>("LOAD_SPLAT_"#args[1])>; +def : 
LoadPatImmOff<!cast<ValueType>(args[0]), + !cast<PatFrag>("load_splat_"#args[0]), + or_is_add, + !cast<NI>("LOAD_SPLAT_"#args[1])>; +def : LoadPatOffsetOnly<!cast<ValueType>(args[0]), + !cast<PatFrag>("load_splat_"#args[0]), + !cast<NI>("LOAD_SPLAT_"#args[1])>; +def : LoadPatGlobalAddrOffOnly<!cast<ValueType>(args[0]), + !cast<PatFrag>("load_splat_"#args[0]), + !cast<NI>("LOAD_SPLAT_"#args[1])>; } -// Store: v128.store -multiclass SIMDStore<ValueType vec_t> { - let mayStore = 1, UseNamedOperandTable = 1 in - defm STORE_#vec_t : - SIMD_I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr, V128:$vec), +// Load and extend +multiclass SIMDLoadExtend<ValueType vec_t, string name, bits<32> simdop> { + let mayLoad = 1, UseNamedOperandTable = 1, + Predicates = [HasUnimplementedSIMD128] in { + defm LOAD_EXTEND_S_#vec_t : + SIMD_I<(outs V128:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr), + (outs), (ins P2Align:$p2align, offset32_op:$off), [], + name#"_s\t$dst, ${off}(${addr})$p2align", + name#"_s\t$off$p2align", simdop>; + defm LOAD_EXTEND_U_#vec_t : + SIMD_I<(outs V128:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr), (outs), (ins P2Align:$p2align, offset32_op:$off), [], - "v128.store\t${off}(${addr})$p2align, $vec", - "v128.store\t$off$p2align", 1>; + name#"_u\t$dst, ${off}(${addr})$p2align", + name#"_u\t$off$p2align", !add(simdop, 1)>; + } } -foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { -defm "" : SIMDStore<vec_t>; +defm "" : SIMDLoadExtend<v8i16, "i16x8.load8x8", 210>; +defm "" : SIMDLoadExtend<v4i32, "i32x4.load16x4", 212>; +defm "" : SIMDLoadExtend<v2i64, "i64x2.load32x2", 214>; + +let Predicates = [HasUnimplementedSIMD128] in +foreach types = [[v8i16, i8], [v4i32, i16], [v2i64, i32]] in +foreach exts = [["sextloadv", "_S"], + ["zextloadv", "_U"], + ["extloadv", "_U"]] in { +def : LoadPatNoOffset<types[0], !cast<PatFrag>(exts[0]#types[1]), + !cast<NI>("LOAD_EXTEND"#exts[1]#"_"#types[0])>; +def : LoadPatImmOff<types[0], !cast<PatFrag>(exts[0]#types[1]), regPlusImm, + !cast<NI>("LOAD_EXTEND"#exts[1]#"_"#types[0])>; +def : LoadPatImmOff<types[0], !cast<PatFrag>(exts[0]#types[1]), or_is_add, + !cast<NI>("LOAD_EXTEND"#exts[1]#"_"#types[0])>; +def : LoadPatOffsetOnly<types[0], !cast<PatFrag>(exts[0]#types[1]), + !cast<NI>("LOAD_EXTEND"#exts[1]#"_"#types[0])>; +def : LoadPatGlobalAddrOffOnly<types[0], !cast<PatFrag>(exts[0]#types[1]), + !cast<NI>("LOAD_EXTEND"#exts[1]#"_"#types[0])>; +} + + +// Store: v128.store +let mayStore = 1, UseNamedOperandTable = 1 in +defm STORE_V128 : + SIMD_I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr, V128:$vec), + (outs), (ins P2Align:$p2align, offset32_op:$off), [], + "v128.store\t${off}(${addr})$p2align, $vec", + "v128.store\t$off$p2align", 1>; +foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { // Def load and store patterns from WebAssemblyInstrMemory.td for vector types -def : StorePatNoOffset<vec_t, store, !cast<NI>("STORE_"#vec_t)>; -def : StorePatImmOff<vec_t, store, regPlusImm, !cast<NI>("STORE_"#vec_t)>; -def : StorePatImmOff<vec_t, store, or_is_add, !cast<NI>("STORE_"#vec_t)>; -def : StorePatGlobalAddr<vec_t, store, !cast<NI>("STORE_"#vec_t)>; -def : StorePatOffsetOnly<vec_t, store, !cast<NI>("STORE_"#vec_t)>; -def : StorePatGlobalAddrOffOnly<vec_t, store, !cast<NI>("STORE_"#vec_t)>; +def : StorePatNoOffset<vec_t, store, STORE_V128>; +def : StorePatImmOff<vec_t, store, regPlusImm, STORE_V128>; +def : StorePatImmOff<vec_t, store, or_is_add, STORE_V128>; +def : 
StorePatOffsetOnly<vec_t, store, STORE_V128>; +def : StorePatGlobalAddrOffOnly<vec_t, store, STORE_V128>; } //===----------------------------------------------------------------------===// @@ -90,7 +167,7 @@ def : StorePatGlobalAddrOffOnly<vec_t, store, !cast<NI>("STORE_"#vec_t)>; // Constant: v128.const multiclass ConstVec<ValueType vec_t, dag ops, dag pat, string args> { let isMoveImm = 1, isReMaterializable = 1, - Predicates = [HasSIMD128, HasUnimplementedSIMD128] in + Predicates = [HasUnimplementedSIMD128] in defm CONST_V128_#vec_t : SIMD_I<(outs V128:$dst), ops, (outs), ops, [(set V128:$dst, (vec_t pat))], "v128.const\t$dst, "#args, @@ -198,6 +275,19 @@ def : Pat<(vec_t (wasm_shuffle (vec_t V128:$x), (vec_t V128:$y), (i32 LaneIdx32:$mE), (i32 LaneIdx32:$mF)))>; } +// Swizzle lanes: v8x16.swizzle +def wasm_swizzle_t : SDTypeProfile<1, 2, []>; +def wasm_swizzle : SDNode<"WebAssemblyISD::SWIZZLE", wasm_swizzle_t>; +let Predicates = [HasUnimplementedSIMD128] in +defm SWIZZLE : + SIMD_I<(outs V128:$dst), (ins V128:$src, V128:$mask), (outs), (ins), + [(set (v16i8 V128:$dst), + (wasm_swizzle (v16i8 V128:$src), (v16i8 V128:$mask)))], + "v8x16.swizzle\t$dst, $src, $mask", "v8x16.swizzle", 192>; + +def : Pat<(int_wasm_swizzle (v16i8 V128:$src), (v16i8 V128:$mask)), + (SWIZZLE V128:$src, V128:$mask)>; + // Create vector with identical lanes: splat def splat2 : PatFrag<(ops node:$x), (build_vector node:$x, node:$x)>; def splat4 : PatFrag<(ops node:$x), (build_vector @@ -286,7 +376,7 @@ multiclass ExtractLaneExtended<string sign, bits<32> baseInst> { } defm "" : ExtractLaneExtended<"_s", 5>; -let Predicates = [HasSIMD128, HasUnimplementedSIMD128] in +let Predicates = [HasUnimplementedSIMD128] in defm "" : ExtractLaneExtended<"_u", 6>; defm "" : ExtractLane<v4i32, "i32x4", LaneIdx4, I32, 13>; defm "" : ExtractLane<v2i64, "i64x2", LaneIdx2, I64, 16>; @@ -472,6 +562,11 @@ defm OR : SIMDBitwise<or, "or", 78>; defm XOR : SIMDBitwise<xor, "xor", 79>; } // isCommutable = 1 +// Bitwise logic: v128.andnot +def andnot : PatFrag<(ops node:$left, node:$right), (and $left, (vnot $right))>; +let Predicates = [HasUnimplementedSIMD128] in +defm ANDNOT : SIMDBitwise<andnot, "andnot", 216>; + // Bitwise select: v128.bitselect foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in defm BITSELECT_#vec_t : @@ -655,7 +750,7 @@ defm ABS : SIMDUnaryFP<fabs, "abs", 149>; defm NEG : SIMDUnaryFP<fneg, "neg", 150>; // Square root: sqrt -let Predicates = [HasSIMD128, HasUnimplementedSIMD128] in +let Predicates = [HasUnimplementedSIMD128] in defm SQRT : SIMDUnaryFP<fsqrt, "sqrt", 151>; //===----------------------------------------------------------------------===// @@ -679,7 +774,7 @@ let isCommutable = 1 in defm MUL : SIMDBinaryFP<fmul, "mul", 156>; // Division: div -let Predicates = [HasSIMD128, HasUnimplementedSIMD128] in +let Predicates = [HasUnimplementedSIMD128] in defm DIV : SIMDBinaryFP<fdiv, "div", 157>; // NaN-propagating minimum: min @@ -712,6 +807,42 @@ defm "" : SIMDConvert<v4i32, v4f32, fp_to_uint, "i32x4.trunc_sat_f32x4_u", 172>; defm "" : SIMDConvert<v2i64, v2f64, fp_to_sint, "i64x2.trunc_sat_f64x2_s", 173>; defm "" : SIMDConvert<v2i64, v2f64, fp_to_uint, "i64x2.trunc_sat_f64x2_u", 174>; +// Widening operations +multiclass SIMDWiden<ValueType vec_t, string vec, ValueType arg_t, string arg, + bits<32> baseInst> { + defm "" : SIMDConvert<vec_t, arg_t, int_wasm_widen_low_signed, + vec#".widen_low_"#arg#"_s", baseInst>; + defm "" : SIMDConvert<vec_t, arg_t, int_wasm_widen_high_signed, + 
vec#".widen_high_"#arg#"_s", !add(baseInst, 1)>; + defm "" : SIMDConvert<vec_t, arg_t, int_wasm_widen_low_unsigned, + vec#".widen_low_"#arg#"_u", !add(baseInst, 2)>; + defm "" : SIMDConvert<vec_t, arg_t, int_wasm_widen_high_unsigned, + vec#".widen_high_"#arg#"_u", !add(baseInst, 3)>; +} + +defm "" : SIMDWiden<v8i16, "i16x8", v16i8, "i8x16", 202>; +defm "" : SIMDWiden<v4i32, "i32x4", v8i16, "i16x8", 206>; + +// Narrowing operations +multiclass SIMDNarrow<ValueType vec_t, string vec, ValueType arg_t, string arg, + bits<32> baseInst> { + defm NARROW_S_#vec_t : + SIMD_I<(outs V128:$dst), (ins V128:$low, V128:$high), (outs), (ins), + [(set (vec_t V128:$dst), (vec_t (int_wasm_narrow_signed + (arg_t V128:$low), (arg_t V128:$high))))], + vec#".narrow_"#arg#"_s\t$dst, $low, $high", vec#".narrow_"#arg#"_s", + baseInst>; + defm NARROW_U_#vec_t : + SIMD_I<(outs V128:$dst), (ins V128:$low, V128:$high), (outs), (ins), + [(set (vec_t V128:$dst), (vec_t (int_wasm_narrow_unsigned + (arg_t V128:$low), (arg_t V128:$high))))], + vec#".narrow_"#arg#"_u\t$dst, $low, $high", vec#".narrow_"#arg#"_u", + !add(baseInst, 1)>; +} + +defm "" : SIMDNarrow<v16i8, "i8x16", v8i16, "i16x8", 198>; +defm "" : SIMDNarrow<v8i16, "i16x8", v4i32, "i32x4", 200>; + // Lower llvm.wasm.trunc.saturate.* to saturating instructions def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))), (fp_to_sint_v4i32_v4f32 (v4f32 V128:$src))>; @@ -732,3 +863,25 @@ foreach t2 = !foldl( ) ) in def : Pat<(t1 (bitconvert (t2 V128:$v))), (t1 V128:$v)>; + +//===----------------------------------------------------------------------===// +// Quasi-Fused Multiply- Add and Subtract (QFMA/QFMS) +//===----------------------------------------------------------------------===// + +multiclass SIMDQFM<ValueType vec_t, string vec, bits<32> baseInst> { + defm QFMA_#vec_t : + SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), + (outs), (ins), + [(set (vec_t V128:$dst), + (int_wasm_qfma (vec_t V128:$a), (vec_t V128:$b), (vec_t V128:$c)))], + vec#".qfma\t$dst, $a, $b, $c", vec#".qfma", baseInst>; + defm QFMS_#vec_t : + SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), + (outs), (ins), + [(set (vec_t V128:$dst), + (int_wasm_qfms (vec_t V128:$a), (vec_t V128:$b), (vec_t V128:$c)))], + vec#".qfms\t$dst, $a, $b, $c", vec#".qfms", !add(baseInst, 1)>; +} + +defm "" : SIMDQFM<v4f32, "f32x4", 0x98>; +defm "" : SIMDQFM<v2f64, "f64x2", 0xa3>; diff --git a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp index e92b34430272..75d04252cbe9 100644 --- a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp +++ b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp @@ -19,6 +19,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/WasmEHFuncInfo.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/Support/Debug.h" using namespace llvm; #define DEBUG_TYPE "wasm-late-eh-prepare" @@ -131,7 +132,7 @@ bool WebAssemblyLateEHPrepare::addCatches(MachineFunction &MF) { auto InsertPos = MBB.begin(); if (InsertPos->isEHLabel()) // EH pad starts with an EH label ++InsertPos; - unsigned DstReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass); + Register DstReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass); BuildMI(MBB, InsertPos, MBB.begin()->getDebugLoc(), TII.get(WebAssembly::CATCH), DstReg); } @@ -168,7 +169,7 @@ bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) { if (CatchPos->isEHLabel()) // EH pad starts with an EH label ++CatchPos; MachineInstr 
*Catch = &*CatchPos; - unsigned ExnReg = Catch->getOperand(0).getReg(); + Register ExnReg = Catch->getOperand(0).getReg(); BuildMI(MBB, TI, TI->getDebugLoc(), TII.get(WebAssembly::RETHROW)) .addReg(ExnReg); TI->eraseFromParent(); @@ -233,6 +234,7 @@ bool WebAssemblyLateEHPrepare::removeUnnecessaryUnreachables( // it. The pseudo instruction will be deleted later. bool WebAssemblyLateEHPrepare::addExceptionExtraction(MachineFunction &MF) { const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); auto *EHInfo = MF.getWasmEHFuncInfo(); SmallVector<MachineInstr *, 16> ExtractInstrs; SmallVector<MachineInstr *, 8> ToDelete; @@ -292,7 +294,7 @@ bool WebAssemblyLateEHPrepare::addExceptionExtraction(MachineFunction &MF) { // thenbb: // %exn:i32 = extract_exception // ... use exn ... - unsigned ExnReg = Catch->getOperand(0).getReg(); + Register ExnReg = Catch->getOperand(0).getReg(); auto *ThenMBB = MF.CreateMachineBasicBlock(); auto *ElseMBB = MF.CreateMachineBasicBlock(); MF.insert(std::next(MachineFunction::iterator(EHPad)), ElseMBB); @@ -339,9 +341,11 @@ bool WebAssemblyLateEHPrepare::addExceptionExtraction(MachineFunction &MF) { WebAssembly::ClangCallTerminateFn); assert(ClangCallTerminateFn && "There is no __clang_call_terminate() function"); + Register Reg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + BuildMI(ElseMBB, DL, TII.get(WebAssembly::CONST_I32), Reg).addImm(0); BuildMI(ElseMBB, DL, TII.get(WebAssembly::CALL_VOID)) .addGlobalAddress(ClangCallTerminateFn) - .addImm(0); + .addReg(Reg); BuildMI(ElseMBB, DL, TII.get(WebAssembly::UNREACHABLE)); } else { diff --git a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp index 34a8195ac4b4..4314aa611549 100644 --- a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp +++ b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp @@ -68,7 +68,7 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) { if (MI->getOpcode() != WebAssembly::BR_UNLESS) continue; - unsigned Cond = MI->getOperand(1).getReg(); + Register Cond = MI->getOperand(1).getReg(); bool Inverted = false; // Attempt to invert the condition in place. @@ -188,7 +188,7 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) { // If we weren't able to invert the condition in place. Insert an // instruction to invert it. 
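The two strategies read naturally as a peephole rewrite; a minimal sketch of what LowerBrUnless achieves, not from the patch, with wasm mnemonics in the comments:

#include <cstdint>

// br_unless has no wasm encoding, so it must become br_if:
//   preferred: flip the producing comparison in place,
//     br_unless %l, (i32.lt_s a, b)  ==>  br_if %l, (i32.ge_s a, b)
//   fallback (the path that follows below): negate the condition,
//     br_unless %l, %cond            ==>  br_if %l, (i32.eqz %cond)
uint32_t eqz(uint32_t Cond) { return Cond == 0 ? 1 : 0; } // EQZ_I32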
if (!Inverted) { - unsigned Tmp = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + Register Tmp = MRI.createVirtualRegister(&WebAssembly::I32RegClass); BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::EQZ_I32), Tmp) .addReg(Cond); MFI.stackifyVReg(Tmp); diff --git a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 960d5134f6e9..1cf397dd060b 100644 --- a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -227,15 +227,6 @@ static cl::list<std::string> namespace { class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass { - static const char *ResumeFName; - static const char *EHTypeIDFName; - static const char *EmLongjmpFName; - static const char *EmLongjmpJmpbufFName; - static const char *SaveSetjmpFName; - static const char *TestSetjmpFName; - static const char *FindMatchingCatchPrefix; - static const char *InvokePrefix; - bool EnableEH; // Enable exception handling bool EnableSjLj; // Enable setjmp/longjmp handling @@ -274,6 +265,7 @@ class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass { bool areAllExceptionsAllowed() const { return EHWhitelistSet.empty(); } bool canLongjmp(Module &M, const Value *Callee) const; + bool isEmAsmCall(Module &M, const Value *Callee) const; void rebuildSSA(Function &F); @@ -292,19 +284,6 @@ public: }; } // End anonymous namespace -const char *WebAssemblyLowerEmscriptenEHSjLj::ResumeFName = "__resumeException"; -const char *WebAssemblyLowerEmscriptenEHSjLj::EHTypeIDFName = - "llvm_eh_typeid_for"; -const char *WebAssemblyLowerEmscriptenEHSjLj::EmLongjmpFName = - "emscripten_longjmp"; -const char *WebAssemblyLowerEmscriptenEHSjLj::EmLongjmpJmpbufFName = - "emscripten_longjmp_jmpbuf"; -const char *WebAssemblyLowerEmscriptenEHSjLj::SaveSetjmpFName = "saveSetjmp"; -const char *WebAssemblyLowerEmscriptenEHSjLj::TestSetjmpFName = "testSetjmp"; -const char *WebAssemblyLowerEmscriptenEHSjLj::FindMatchingCatchPrefix = - "__cxa_find_matching_catch_"; -const char *WebAssemblyLowerEmscriptenEHSjLj::InvokePrefix = "__invoke_"; - char WebAssemblyLowerEmscriptenEHSjLj::ID = 0; INITIALIZE_PASS(WebAssemblyLowerEmscriptenEHSjLj, DEBUG_TYPE, "WebAssembly Lower Emscripten Exceptions / Setjmp / Longjmp", @@ -335,7 +314,8 @@ static bool canThrow(const Value *V) { static GlobalVariable *getGlobalVariableI32(Module &M, IRBuilder<> &IRB, const char *Name) { - auto* GV = dyn_cast<GlobalVariable>(M.getOrInsertGlobal(Name, IRB.getInt32Ty())); + auto *GV = + dyn_cast<GlobalVariable>(M.getOrInsertGlobal(Name, IRB.getInt32Ty())); if (!GV) report_fatal_error(Twine("unable to create global: ") + Name); @@ -376,9 +356,9 @@ WebAssemblyLowerEmscriptenEHSjLj::getFindMatchingCatch(Module &M, PointerType *Int8PtrTy = Type::getInt8PtrTy(M.getContext()); SmallVector<Type *, 16> Args(NumClauses, Int8PtrTy); FunctionType *FTy = FunctionType::get(Int8PtrTy, Args, false); - Function *F = - Function::Create(FTy, GlobalValue::ExternalLinkage, - FindMatchingCatchPrefix + Twine(NumClauses + 2), &M); + Function *F = Function::Create( + FTy, GlobalValue::ExternalLinkage, + "__cxa_find_matching_catch_" + Twine(NumClauses + 2), &M); FindMatchingCatches[NumClauses] = F; return F; } @@ -418,7 +398,7 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) { Args.append(CI->arg_begin(), CI->arg_end()); CallInst *NewCall = IRB.CreateCall(getInvokeWrapper(CI), Args); NewCall->takeName(CI); - 
NewCall->setCallingConv(CI->getCallingConv()); + NewCall->setCallingConv(CallingConv::WASM_EmscriptenInvoke); NewCall->setDebugLoc(CI->getDebugLoc()); // Because we added the pointer to the callee as first argument, all @@ -432,9 +412,22 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) { for (unsigned I = 0, E = CI->getNumArgOperands(); I < E; ++I) ArgAttributes.push_back(InvokeAL.getParamAttributes(I)); + AttrBuilder FnAttrs(InvokeAL.getFnAttributes()); + if (FnAttrs.contains(Attribute::AllocSize)) { + // The allocsize attribute (if any) refers to parameters by index and needs + // to be adjusted. + unsigned SizeArg; + Optional<unsigned> NEltArg; + std::tie(SizeArg, NEltArg) = FnAttrs.getAllocSizeArgs(); + SizeArg += 1; + if (NEltArg.hasValue()) + NEltArg = NEltArg.getValue() + 1; + FnAttrs.addAllocSizeAttr(SizeArg, NEltArg); + } + // Reconstruct the AttributesList based on the vector we constructed. AttributeList NewCallAL = - AttributeList::get(C, InvokeAL.getFnAttributes(), + AttributeList::get(C, AttributeSet::get(C, FnAttrs), InvokeAL.getRetAttributes(), ArgAttributes); NewCall->setAttributes(NewCallAL); @@ -473,8 +466,8 @@ Function *WebAssemblyLowerEmscriptenEHSjLj::getInvokeWrapper(CallOrInvoke *CI) { FunctionType *FTy = FunctionType::get(CalleeFTy->getReturnType(), ArgTys, CalleeFTy->isVarArg()); - Function *F = Function::Create(FTy, GlobalValue::ExternalLinkage, - InvokePrefix + Sig, M); + Function *F = + Function::Create(FTy, GlobalValue::ExternalLinkage, "__invoke_" + Sig, M); InvokeWrappers[Sig] = F; return F; } @@ -491,39 +484,44 @@ bool WebAssemblyLowerEmscriptenEHSjLj::canLongjmp(Module &M, // and can't be passed by pointer. The result is a crash with illegal IR. if (isa<InlineAsm>(Callee)) return false; + StringRef CalleeName = Callee->getName(); // The reason we include malloc/free here is to exclude the malloc/free // calls generated in setjmp prep / cleanup routines.
- Function *SetjmpF = M.getFunction("setjmp"); - Function *MallocF = M.getFunction("malloc"); - Function *FreeF = M.getFunction("free"); - if (Callee == SetjmpF || Callee == MallocF || Callee == FreeF) + if (CalleeName == "setjmp" || CalleeName == "malloc" || CalleeName == "free") return false; // There are functions in JS glue code - if (Callee == ResumeF || Callee == EHTypeIDF || Callee == SaveSetjmpF || - Callee == TestSetjmpF) + if (CalleeName == "__resumeException" || CalleeName == "llvm_eh_typeid_for" || + CalleeName == "saveSetjmp" || CalleeName == "testSetjmp" || + CalleeName == "getTempRet0" || CalleeName == "setTempRet0") return false; // __cxa_find_matching_catch_N functions cannot longjmp - if (Callee->getName().startswith(FindMatchingCatchPrefix)) + if (Callee->getName().startswith("__cxa_find_matching_catch_")) return false; // Exception-catching related functions - Function *BeginCatchF = M.getFunction("__cxa_begin_catch"); - Function *EndCatchF = M.getFunction("__cxa_end_catch"); - Function *AllocExceptionF = M.getFunction("__cxa_allocate_exception"); - Function *ThrowF = M.getFunction("__cxa_throw"); - Function *TerminateF = M.getFunction("__clang_call_terminate"); - if (Callee == BeginCatchF || Callee == EndCatchF || - Callee == AllocExceptionF || Callee == ThrowF || Callee == TerminateF || - Callee == GetTempRet0Func || Callee == SetTempRet0Func) + if (CalleeName == "__cxa_begin_catch" || CalleeName == "__cxa_end_catch" || + CalleeName == "__cxa_allocate_exception" || CalleeName == "__cxa_throw" || + CalleeName == "__clang_call_terminate") return false; // Otherwise we don't know return true; } +bool WebAssemblyLowerEmscriptenEHSjLj::isEmAsmCall(Module &M, + const Value *Callee) const { + StringRef CalleeName = Callee->getName(); + // This is an exhaustive list from Emscripten's <emscripten/em_asm.h>. + return CalleeName == "emscripten_asm_const_int" || + CalleeName == "emscripten_asm_const_double" || + CalleeName == "emscripten_asm_const_int_sync_on_main_thread" || + CalleeName == "emscripten_asm_const_double_sync_on_main_thread" || + CalleeName == "emscripten_asm_const_async_on_main_thread"; +} + // Generate testSetjmp function call sequence with preamble and postamble.
// The code this generates is equivalent to the following JavaScript code: // if (%__THREW__.val != 0 & threwValue != 0) { @@ -605,15 +603,12 @@ void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) { SSAUpdater SSA; for (BasicBlock &BB : F) { for (Instruction &I : BB) { + SSA.Initialize(I.getType(), I.getName()); + SSA.AddAvailableValue(&BB, &I); for (auto UI = I.use_begin(), UE = I.use_end(); UI != UE;) { Use &U = *UI; ++UI; - SSA.Initialize(I.getType(), I.getName()); - SSA.AddAvailableValue(&BB, &I); auto *User = cast<Instruction>(U.getUser()); - if (User->getParent() == &BB) - continue; - if (auto *UserPN = dyn_cast<PHINode>(User)) if (UserPN->getIncomingBlock(U) == &BB) continue; @@ -660,13 +655,13 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) { FunctionType *ResumeFTy = FunctionType::get(IRB.getVoidTy(), IRB.getInt8PtrTy(), false); ResumeF = Function::Create(ResumeFTy, GlobalValue::ExternalLinkage, - ResumeFName, &M); + "__resumeException", &M); // Register llvm_eh_typeid_for function FunctionType *EHTypeIDTy = FunctionType::get(IRB.getInt32Ty(), IRB.getInt8PtrTy(), false); EHTypeIDF = Function::Create(EHTypeIDTy, GlobalValue::ExternalLinkage, - EHTypeIDFName, &M); + "llvm_eh_typeid_for", &M); for (Function &F : M) { if (F.isDeclaration()) continue; @@ -684,7 +679,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) { // defined in JS code EmLongjmpJmpbufF = Function::Create(LongjmpF->getFunctionType(), GlobalValue::ExternalLinkage, - EmLongjmpJmpbufFName, &M); + "emscripten_longjmp_jmpbuf", &M); LongjmpF->replaceAllUsesWith(EmLongjmpJmpbufF); } @@ -697,19 +692,19 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) { IRB.getInt32Ty()}; FunctionType *FTy = FunctionType::get(Type::getInt32PtrTy(C), Params, false); - SaveSetjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage, - SaveSetjmpFName, &M); + SaveSetjmpF = + Function::Create(FTy, GlobalValue::ExternalLinkage, "saveSetjmp", &M); // Register testSetjmp function Params = {IRB.getInt32Ty(), Type::getInt32PtrTy(C), IRB.getInt32Ty()}; FTy = FunctionType::get(IRB.getInt32Ty(), Params, false); - TestSetjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage, - TestSetjmpFName, &M); + TestSetjmpF = + Function::Create(FTy, GlobalValue::ExternalLinkage, "testSetjmp", &M); FTy = FunctionType::get(IRB.getVoidTy(), {IRB.getInt32Ty(), IRB.getInt32Ty()}, false); EmLongjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage, - EmLongjmpFName, &M); + "emscripten_longjmp", &M); // Only traverse functions that use setjmp in order not to insert // unnecessary prep / cleanup code in every function @@ -970,10 +965,16 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) { const Value *Callee = CI->getCalledValue(); if (!canLongjmp(M, Callee)) continue; + if (isEmAsmCall(M, Callee)) + report_fatal_error("Cannot use EM_ASM* alongside setjmp/longjmp in " + + F.getName() + + ". 
Please consider using EM_JS, or move the " + "EM_ASM into another function.", + false); Value *Threw = nullptr; BasicBlock *Tail; - if (Callee->getName().startswith(InvokePrefix)) { + if (Callee->getName().startswith("__invoke_")) { // If invoke wrapper has already been generated for this call in // previous EH phase, search for the load instruction // %__THREW__.val = __THREW__; diff --git a/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp b/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp index 494d3fadbc8c..750b2233e67a 100644 --- a/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp +++ b/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp @@ -94,7 +94,7 @@ bool LowerGlobalDtors::runOnModule(Module &M) { break; // Found a null terminator, skip the rest. Constant *Associated = CS->getOperand(2); - Associated = cast<Constant>(Associated->stripPointerCastsNoFollowAliases()); + Associated = cast<Constant>(Associated->stripPointerCasts()); DtorFuncs[PriorityValue][Associated].push_back(DtorFunc); } diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp index 288b991ae2c5..59c10243c545 100644 --- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp +++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -79,7 +79,7 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol( // Clang-provided symbols. if (strcmp(Name, "__stack_pointer") == 0 || strcmp(Name, "__tls_base") == 0 || strcmp(Name, "__memory_base") == 0 || strcmp(Name, "__table_base") == 0 || - strcmp(Name, "__tls_size") == 0) { + strcmp(Name, "__tls_size") == 0 || strcmp(Name, "__tls_align") == 0) { bool Mutable = strcmp(Name, "__stack_pointer") == 0 || strcmp(Name, "__tls_base") == 0; WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); @@ -115,7 +115,7 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol( getLibcallSignature(Subtarget, Name, Returns, Params); } auto Signature = - make_unique<wasm::WasmSignature>(std::move(Returns), std::move(Params)); + std::make_unique<wasm::WasmSignature>(std::move(Returns), std::move(Params)); WasmSym->setSignature(Signature.get()); Printer.addSignature(std::move(Signature)); @@ -163,6 +163,21 @@ MCOperand WebAssemblyMCInstLower::lowerSymbolOperand(const MachineOperand &MO, return MCOperand::createExpr(Expr); } +MCOperand WebAssemblyMCInstLower::lowerTypeIndexOperand( + SmallVector<wasm::ValType, 1> &&Returns, + SmallVector<wasm::ValType, 4> &&Params) const { + auto Signature = std::make_unique<wasm::WasmSignature>(std::move(Returns), + std::move(Params)); + MCSymbol *Sym = Printer.createTempSymbol("typeindex"); + auto *WasmSym = cast<MCSymbolWasm>(Sym); + WasmSym->setSignature(Signature.get()); + Printer.addSignature(std::move(Signature)); + WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); + const MCExpr *Expr = + MCSymbolRefExpr::create(WasmSym, MCSymbolRefExpr::VK_WASM_TYPEINDEX, Ctx); + return MCOperand::createExpr(Expr); +} + // Return the WebAssembly type associated with the given register class. 
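// (One of i32, i64, f32, f64, v128, or exnref; any other register class is a
// backend bug, hence the llvm_unreachable in getType below.)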
static wasm::ValType getType(const TargetRegisterClass *RC) { if (RC == &WebAssembly::I32RegClass) @@ -178,6 +193,16 @@ static wasm::ValType getType(const TargetRegisterClass *RC) { llvm_unreachable("Unexpected register class"); } +static void getFunctionReturns(const MachineInstr *MI, + SmallVectorImpl<wasm::ValType> &Returns) { + const Function &F = MI->getMF()->getFunction(); + const TargetMachine &TM = MI->getMF()->getTarget(); + Type *RetTy = F.getReturnType(); + SmallVector<MVT, 4> CallerRetTys; + computeLegalValueVTs(F, TM, RetTy, CallerRetTys); + valTypesFromMVTs(CallerRetTys, Returns); +} + void WebAssemblyMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); @@ -208,8 +233,6 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI, if (I < Desc.NumOperands) { const MCOperandInfo &Info = Desc.OpInfo[I]; if (Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) { - MCSymbol *Sym = Printer.createTempSymbol("typeindex"); - SmallVector<wasm::ValType, 4> Returns; SmallVector<wasm::ValType, 4> Params; @@ -226,17 +249,23 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI, if (WebAssembly::isCallIndirect(MI->getOpcode())) Params.pop_back(); - auto *WasmSym = cast<MCSymbolWasm>(Sym); - auto Signature = make_unique<wasm::WasmSignature>(std::move(Returns), - std::move(Params)); - WasmSym->setSignature(Signature.get()); - Printer.addSignature(std::move(Signature)); - WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); + // return_call_indirect instructions have the return type of the + // caller + if (MI->getOpcode() == WebAssembly::RET_CALL_INDIRECT) + getFunctionReturns(MI, Returns); - const MCExpr *Expr = MCSymbolRefExpr::create( - WasmSym, MCSymbolRefExpr::VK_WASM_TYPEINDEX, Ctx); - MCOp = MCOperand::createExpr(Expr); + MCOp = lowerTypeIndexOperand(std::move(Returns), std::move(Params)); break; + } else if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) { + auto BT = static_cast<WebAssembly::BlockType>(MO.getImm()); + assert(BT != WebAssembly::BlockType::Invalid); + if (BT == WebAssembly::BlockType::Multivalue) { + SmallVector<wasm::ValType, 1> Returns; + getFunctionReturns(MI, Returns); + MCOp = lowerTypeIndexOperand(std::move(Returns), + SmallVector<wasm::ValType, 4>()); + break; + } } } MCOp = MCOperand::createImm(MO.getImm()); diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h index 2c375a01a7f5..d79c54097eb7 100644 --- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h +++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMCINSTLOWER_H #define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMCINSTLOWER_H +#include "llvm/BinaryFormat/Wasm.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/Compiler.h" @@ -33,6 +34,8 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower { MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const; MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; + MCOperand lowerTypeIndexOperand(SmallVector<wasm::ValType, 1> &&, + SmallVector<wasm::ValType, 4> &&) const; public: WebAssemblyMCInstLower(MCContext &ctx, WebAssemblyAsmPrinter &printer) diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp index d31c1226bfdb..e4cc2389147b 100644 --- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp +++ 
b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp @@ -49,10 +49,12 @@ void llvm::computeSignatureVTs(const FunctionType *Ty, const Function &F, computeLegalValueVTs(F, TM, Ty->getReturnType(), Results); MVT PtrVT = MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits()); - if (Results.size() > 1) { - // WebAssembly currently can't lower returns of multiple values without - // demoting to sret (see WebAssemblyTargetLowering::CanLowerReturn). So - // replace multiple return values with a pointer parameter. + if (Results.size() > 1 && + !TM.getSubtarget<WebAssemblySubtarget>(F).hasMultivalue()) { + // WebAssembly can't lower returns of multiple values without demoting to + // sret unless multivalue is enabled (see + // WebAssemblyTargetLowering::CanLowerReturn). So replace multiple return + // values with a pointer parameter. Results.clear(); Params.push_back(PtrVT); } @@ -72,7 +74,7 @@ void llvm::valTypesFromMVTs(const ArrayRef<MVT> &In, std::unique_ptr<wasm::WasmSignature> llvm::signatureFromMVTs(const SmallVectorImpl<MVT> &Results, const SmallVectorImpl<MVT> &Params) { - auto Sig = make_unique<wasm::WasmSignature>(); + auto Sig = std::make_unique<wasm::WasmSignature>(); valTypesFromMVTs(Results, Sig->Returns); valTypesFromMVTs(Params, Sig->Params); return Sig; diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h index 4b9ba491dee6..16e2f4392984 100644 --- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h @@ -96,13 +96,18 @@ public: void stackifyVReg(unsigned VReg) { assert(MF.getRegInfo().getUniqueVRegDef(VReg)); - auto I = TargetRegisterInfo::virtReg2Index(VReg); + auto I = Register::virtReg2Index(VReg); if (I >= VRegStackified.size()) VRegStackified.resize(I + 1); VRegStackified.set(I); } + void unstackifyVReg(unsigned VReg) { + auto I = Register::virtReg2Index(VReg); + if (I < VRegStackified.size()) + VRegStackified.reset(I); + } bool isVRegStackified(unsigned VReg) const { - auto I = TargetRegisterInfo::virtReg2Index(VReg); + auto I = Register::virtReg2Index(VReg); if (I >= VRegStackified.size()) return false; return VRegStackified.test(I); @@ -111,12 +116,12 @@ public: void initWARegs(); void setWAReg(unsigned VReg, unsigned WAReg) { assert(WAReg != UnusedReg); - auto I = TargetRegisterInfo::virtReg2Index(VReg); + auto I = Register::virtReg2Index(VReg); assert(I < WARegs.size()); WARegs[I] = WAReg; } unsigned getWAReg(unsigned VReg) const { - auto I = TargetRegisterInfo::virtReg2Index(VReg); + auto I = Register::virtReg2Index(VReg); assert(I < WARegs.size()); return WARegs[I]; } diff --git a/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp b/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp index 7ac0511c28b0..ac428fcc826a 100644 --- a/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp +++ b/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp @@ -166,8 +166,8 @@ static bool optimizeCall(MachineBasicBlock &MBB, MachineInstr &MI, if (!LibInfo.getLibFunc(Name, Func)) return false; - unsigned FromReg = MI.getOperand(2).getReg(); - unsigned ToReg = MI.getOperand(0).getReg(); + Register FromReg = MI.getOperand(2).getReg(); + Register ToReg = MI.getOperand(0).getReg(); if (MRI.getRegClass(FromReg) != MRI.getRegClass(ToReg)) report_fatal_error("Memory Intrinsic results: call to builtin function " "with wrong signature, from/to mismatch"); @@ -184,7 +184,8 @@ bool
WebAssemblyMemIntrinsicResults::runOnMachineFunction(MachineFunction &MF) { auto &MDT = getAnalysis<MachineDominatorTree>(); const WebAssemblyTargetLowering &TLI = *MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering(); - const auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + const auto &LibInfo = + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(MF.getFunction()); auto &LIS = getAnalysis<LiveIntervals>(); bool Changed = false; diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp index 8c7c3305c201..0bd30791e57c 100644 --- a/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp +++ b/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp @@ -81,7 +81,7 @@ bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction( // Split multiple-VN LiveIntervals into multiple LiveIntervals. SmallVector<LiveInterval *, 4> SplitLIs; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + unsigned Reg = Register::index2VirtReg(I); if (MRI.reg_nodbg_empty(Reg)) continue; diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp index d20352259e07..9b60596e42b4 100644 --- a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp +++ b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp @@ -64,11 +64,8 @@ void OptimizeReturned::visitCallSite(CallSite CS) { if (isa<Constant>(Arg)) continue; // Like replaceDominatedUsesWith but using Instruction/Use dominance. - for (auto UI = Arg->use_begin(), UE = Arg->use_end(); UI != UE;) { - Use &U = *UI++; - if (DT->dominates(Inst, U)) - U.set(Inst); - } + Arg->replaceUsesWithIf(Inst, + [&](Use &U) { return DT->dominates(Inst, U); }); } } diff --git a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp index e11cdeaa0e79..ea6cd09a604c 100644 --- a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp +++ b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp @@ -63,7 +63,7 @@ static bool maybeRewriteToDrop(unsigned OldReg, unsigned NewReg, bool Changed = false; if (OldReg == NewReg) { Changed = true; - unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg)); + Register NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg)); MO.setReg(NewReg); MO.setIsDead(); MFI.stackifyVReg(NewReg); @@ -75,9 +75,7 @@ static bool maybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB, const MachineFunction &MF, WebAssemblyFunctionInfo &MFI, MachineRegisterInfo &MRI, - const WebAssemblyInstrInfo &TII, - unsigned FallthroughOpc, - unsigned CopyLocalOpc) { + const WebAssemblyInstrInfo &TII) { if (DisableWebAssemblyFallthroughReturnOpt) return false; if (&MBB != &MF.back()) @@ -90,13 +88,36 @@ static bool maybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB, if (&MI != &*End) return false; - if (FallthroughOpc != WebAssembly::FALLTHROUGH_RETURN_VOID) { - // If the operand isn't stackified, insert a COPY to read the operand and - // stackify it. - MachineOperand &MO = MI.getOperand(0); - unsigned Reg = MO.getReg(); + for (auto &MO : MI.explicit_operands()) { + // If the operand isn't stackified, insert a COPY to read the operands and + // stackify them. 
+ Register Reg = MO.getReg(); if (!MFI.isVRegStackified(Reg)) { - unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); + unsigned CopyLocalOpc; + const TargetRegisterClass *RegClass = MRI.getRegClass(Reg); + switch (RegClass->getID()) { + case WebAssembly::I32RegClassID: + CopyLocalOpc = WebAssembly::COPY_I32; + break; + case WebAssembly::I64RegClassID: + CopyLocalOpc = WebAssembly::COPY_I64; + break; + case WebAssembly::F32RegClassID: + CopyLocalOpc = WebAssembly::COPY_F32; + break; + case WebAssembly::F64RegClassID: + CopyLocalOpc = WebAssembly::COPY_F64; + break; + case WebAssembly::V128RegClassID: + CopyLocalOpc = WebAssembly::COPY_V128; + break; + case WebAssembly::EXNREFRegClassID: + CopyLocalOpc = WebAssembly::COPY_EXNREF; + break; + default: + llvm_unreachable("Unexpected register class for return operand"); + } + Register NewReg = MRI.createVirtualRegister(RegClass); BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(CopyLocalOpc), NewReg) .addReg(Reg); MO.setReg(NewReg); @@ -104,8 +125,7 @@ static bool maybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB, } } - // Rewrite the return. - MI.setDesc(TII.get(FallthroughOpc)); + MI.setDesc(TII.get(WebAssembly::FALLTHROUGH_RETURN)); return true; } @@ -120,7 +140,8 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) { const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); const WebAssemblyTargetLowering &TLI = *MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering(); - auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &LibInfo = + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(MF.getFunction()); bool Changed = false; for (auto &MBB : MF) @@ -143,8 +164,8 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) { report_fatal_error("Peephole: call to builtin function with " "wrong signature, not consuming reg"); MachineOperand &MO = MI.getOperand(0); - unsigned OldReg = MO.getReg(); - unsigned NewReg = Op2.getReg(); + Register OldReg = MO.getReg(); + Register NewReg = Op2.getReg(); if (MRI.getRegClass(NewReg) != MRI.getRegClass(OldReg)) report_fatal_error("Peephole: call to builtin function with " @@ -156,60 +177,8 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) { break; } // Optimize away an explicit void return at the end of the function. 
- case WebAssembly::RETURN_I32: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_I32, - WebAssembly::COPY_I32); - break; - case WebAssembly::RETURN_I64: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_I64, - WebAssembly::COPY_I64); - break; - case WebAssembly::RETURN_F32: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F32, - WebAssembly::COPY_F32); - break; - case WebAssembly::RETURN_F64: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F64, - WebAssembly::COPY_F64); - break; - case WebAssembly::RETURN_v16i8: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v16i8, - WebAssembly::COPY_V128); - break; - case WebAssembly::RETURN_v8i16: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v8i16, - WebAssembly::COPY_V128); - break; - case WebAssembly::RETURN_v4i32: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v4i32, - WebAssembly::COPY_V128); - break; - case WebAssembly::RETURN_v2i64: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v2i64, - WebAssembly::COPY_V128); - break; - case WebAssembly::RETURN_v4f32: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v4f32, - WebAssembly::COPY_V128); - break; - case WebAssembly::RETURN_v2f64: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v2f64, - WebAssembly::COPY_V128); - break; - case WebAssembly::RETURN_VOID: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_VOID, - WebAssembly::INSTRUCTION_LIST_END); + case WebAssembly::RETURN: + Changed |= maybeRewriteToFallthrough(MI, MBB, MF, MFI, MRI, TII); break; } diff --git a/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp b/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp index 3bfbf607344d..799b9388097c 100644 --- a/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp +++ b/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp @@ -95,7 +95,7 @@ bool WebAssemblyPrepareForLiveIntervals::runOnMachineFunction( // TODO: This is fairly heavy-handed; find a better approach. // for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + unsigned Reg = Register::index2VirtReg(I); // Skip unused registers. if (MRI.use_nodbg_empty(Reg)) diff --git a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp index 6f09c45b6642..043b6f1b7d18 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp @@ -98,7 +98,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "Interesting register intervals:\n"); for (unsigned I = 0; I < NumVRegs; ++I) { - unsigned VReg = TargetRegisterInfo::index2VirtReg(I); + unsigned VReg = Register::index2VirtReg(I); if (MFI.isVRegStackified(VReg)) continue; // Skip unused registers, which can use $drop. 
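Throughout these WebAssembly hunks the old TargetRegisterInfo statics are swapped one-for-one for their Register equivalents. A minimal sketch of the round-trip these helpers provide, illustrative only and assuming the LLVM-internal Register API used above:

    // Register::index2VirtReg maps a dense 0-based index to a virtual
    // register value; Register::virtReg2Index inverts it. The dense index is
    // what backs bit vectors such as VRegStackified in
    // WebAssemblyFunctionInfo.
    for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) {
      unsigned Reg = Register::index2VirtReg(I);
      assert(Register::isVirtualRegister(Reg));  // virtual marker bit is set
      assert(Register::virtReg2Index(Reg) == I); // exact inverse
    }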
@@ -157,9 +157,8 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { Changed |= Old != New; UsedColors.set(Color); Assignments[Color].push_back(LI); - LLVM_DEBUG( - dbgs() << "Assigning vreg" << TargetRegisterInfo::virtReg2Index(LI->reg) - << " to vreg" << TargetRegisterInfo::virtReg2Index(New) << "\n"); + LLVM_DEBUG(dbgs() << "Assigning vreg" << Register::virtReg2Index(LI->reg) + << " to vreg" << Register::virtReg2Index(New) << "\n"); } if (!Changed) return false; diff --git a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp index cdca23f55b29..72e7a7cf5042 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp @@ -89,7 +89,7 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) { // Start the numbering for locals after the arg regs unsigned CurReg = MFI.getParams().size(); for (unsigned VRegIdx = 0; VRegIdx < NumVRegs; ++VRegIdx) { - unsigned VReg = TargetRegisterInfo::index2VirtReg(VRegIdx); + unsigned VReg = Register::index2VirtReg(VRegIdx); // Skip unused registers. if (MRI.use_empty(VReg)) continue; diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index a120a6471014..421d353a89e8 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -120,7 +120,7 @@ static void convertImplicitDefToConstZero(MachineInstr *MI, Type::getDoubleTy(MF.getFunction().getContext()))); MI->addOperand(MachineOperand::CreateFPImm(Val)); } else if (RegClass == &WebAssembly::V128RegClass) { - unsigned TempReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + Register TempReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); MI->setDesc(TII->get(WebAssembly::SPLAT_v4i32)); MI->addOperand(MachineOperand::CreateReg(TempReg, false)); MachineInstr *Const = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), @@ -334,14 +334,14 @@ static bool isSafeToMove(const MachineInstr *Def, const MachineInstr *Insert, for (const MachineOperand &MO : Def->operands()) { if (!MO.isReg() || MO.isUndef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // If the register is dead here and at Insert, ignore it. if (MO.isDead() && Insert->definesRegister(Reg) && !Insert->readsRegister(Reg)) continue; - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (Register::isPhysicalRegister(Reg)) { // Ignore ARGUMENTS; it's just used to keep the ARGUMENT_* instructions // from moving down, and we've already checked for that. if (Reg == WebAssembly::ARGUMENTS) @@ -436,8 +436,8 @@ static bool oneUseDominatesOtherUses(unsigned Reg, const MachineOperand &OneUse, const MachineOperand &MO = UseInst->getOperand(0); if (!MO.isReg()) return false; - unsigned DefReg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(DefReg) || + Register DefReg = MO.getReg(); + if (!Register::isVirtualRegister(DefReg) || !MFI.isVRegStackified(DefReg)) return false; assert(MRI.hasOneNonDBGUse(DefReg)); @@ -499,7 +499,7 @@ static MachineInstr *moveForSingleUse(unsigned Reg, MachineOperand &Op, } else { // The register may have unrelated uses or defs; create a new register for // just our one def and use so that we can stackify it. 
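// (Only a register whose single def directly feeds its single use can live on
// the wasm value stack, so the fresh register guarantees the one-def, one-use
// shape that stackifyVReg expects.)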
- unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); + Register NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); Def->getOperand(0).setReg(NewReg); Op.setReg(NewReg); @@ -535,7 +535,7 @@ static MachineInstr *rematerializeCheapDef( WebAssemblyDebugValueManager DefDIs(&Def); - unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); + Register NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); TII->reMaterialize(MBB, Insert, NewReg, 0, Def, *TRI); Op.setReg(NewReg); MachineInstr *Clone = &*std::prev(Insert); @@ -607,8 +607,8 @@ static MachineInstr *moveAndTeeForMultiUse( // Create the Tee and attach the registers. const auto *RegClass = MRI.getRegClass(Reg); - unsigned TeeReg = MRI.createVirtualRegister(RegClass); - unsigned DefReg = MRI.createVirtualRegister(RegClass); + Register TeeReg = MRI.createVirtualRegister(RegClass); + Register DefReg = MRI.createVirtualRegister(RegClass); MachineOperand &DefMO = Def->getOperand(0); MachineInstr *Tee = BuildMI(MBB, Insert, Insert->getDebugLoc(), TII->get(getTeeOpcode(RegClass)), TeeReg) @@ -807,11 +807,11 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { if (!Op.isReg()) continue; - unsigned Reg = Op.getReg(); + Register Reg = Op.getReg(); assert(Op.isUse() && "explicit_uses() should only iterate over uses"); assert(!Op.isImplicit() && "explicit_uses() should only iterate over explicit operands"); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) continue; // Identify the definition for this register at this point. @@ -915,7 +915,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { for (MachineOperand &MO : reverse(MI.explicit_operands())) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (MFI.isVRegStackified(Reg)) { if (MO.isDef()) diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp index ea9cfc00adfd..789a025794ea 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp @@ -91,8 +91,8 @@ void WebAssemblyRegisterInfo::eliminateFrameIndex( if (MI.getOpcode() == WebAssembly::ADD_I32) { MachineOperand &OtherMO = MI.getOperand(3 - FIOperandNum); if (OtherMO.isReg()) { - unsigned OtherMOReg = OtherMO.getReg(); - if (TargetRegisterInfo::isVirtualRegister(OtherMOReg)) { + Register OtherMOReg = OtherMO.getReg(); + if (Register::isVirtualRegister(OtherMOReg)) { MachineInstr *Def = MF.getRegInfo().getUniqueVRegDef(OtherMOReg); // TODO: For now we just opportunistically do this in the case where // the CONST_I32 happens to have exactly one def and one use. We @@ -117,7 +117,7 @@ void WebAssemblyRegisterInfo::eliminateFrameIndex( // Create i32.add SP, offset and make it the operand. 
const TargetRegisterClass *PtrRC = MRI.getTargetRegisterInfo()->getPointerRegClass(MF); - unsigned OffsetOp = MRI.createVirtualRegister(PtrRC); + Register OffsetOp = MRI.createVirtualRegister(PtrRC); BuildMI(MBB, *II, II->getDebugLoc(), TII->get(WebAssembly::CONST_I32), OffsetOp) .addImm(FrameOffset); diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 7e65368e671a..bdf5fe2620a4 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -140,7 +140,7 @@ WebAssemblyTargetMachine::getSubtargetImpl(std::string CPU, std::string FS) const { auto &I = SubtargetMap[CPU + FS]; if (!I) { - I = llvm::make_unique<WebAssemblySubtarget>(TargetTriple, CPU, FS, *this); + I = std::make_unique<WebAssemblySubtarget>(TargetTriple, CPU, FS, *this); } return I.get(); } diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index 46ef765ce0f4..1c53e90daea7 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -25,10 +25,11 @@ WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const { return TargetTransformInfo::PSK_FastHardware; } -unsigned WebAssemblyTTIImpl::getNumberOfRegisters(bool Vector) { - unsigned Result = BaseT::getNumberOfRegisters(Vector); +unsigned WebAssemblyTTIImpl::getNumberOfRegisters(unsigned ClassID) const { + unsigned Result = BaseT::getNumberOfRegisters(ClassID); // For SIMD, use at least 16 registers, as a rough guess. + bool Vector = (ClassID == 1); if (Vector) Result = std::max(Result, 16u); diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index 1b11b4b631eb..f0ecc73e91de 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -53,7 +53,7 @@ public: /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(bool Vector); + unsigned getNumberOfRegisters(unsigned ClassID) const; unsigned getRegisterBitWidth(bool Vector) const; unsigned getArithmeticInstrCost( unsigned Opcode, Type *Ty, diff --git a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp index e9d88d4818a5..a237da8154ab 100644 --- a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp +++ b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp @@ -32,9 +32,8 @@ bool WebAssembly::isChild(const MachineInstr &MI, const MachineOperand &MO = MI.getOperand(0); if (!MO.isReg() || MO.isImplicit() || !MO.isDef()) return false; - unsigned Reg = MO.getReg(); - return TargetRegisterInfo::isVirtualRegister(Reg) && - MFI.isVRegStackified(Reg); + Register Reg = MO.getReg(); + return Register::isVirtualRegister(Reg) && MFI.isVRegStackified(Reg); } bool WebAssembly::mayThrow(const MachineInstr &MI) { @@ -51,7 +50,21 @@ bool WebAssembly::mayThrow(const MachineInstr &MI) { return false; const MachineOperand &MO = MI.getOperand(getCalleeOpNo(MI.getOpcode())); - assert(MO.isGlobal()); + assert(MO.isGlobal() || MO.isSymbol()); + + if (MO.isSymbol()) { + // Some intrinsics are lowered to calls to external symbols, which are then + // lowered to calls to library functions. Most libcalls don't throw, but + // we only list some of them here now.
+ // TODO Consider adding 'nounwind' info in TargetLowering::CallLoweringInfo + // instead for more accurate info. + const char *Name = MO.getSymbolName(); + if (strcmp(Name, "memcpy") == 0 || strcmp(Name, "memmove") == 0 || + strcmp(Name, "memset") == 0) + return false; + return true; + } + const auto *F = dyn_cast<Function>(MO.getGlobal()); if (!F) return true; diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 95cbf46d37ed..25be79ec2b1e 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -870,6 +870,14 @@ private: bool parseDirectiveFPOEndProc(SMLoc L); bool parseDirectiveFPOData(SMLoc L); + /// SEH directives. + bool parseSEHRegisterNumber(unsigned RegClassID, unsigned &RegNo); + bool parseDirectiveSEHPushReg(SMLoc); + bool parseDirectiveSEHSetFrame(SMLoc); + bool parseDirectiveSEHSaveReg(SMLoc); + bool parseDirectiveSEHSaveXMM(SMLoc); + bool parseDirectiveSEHPushFrame(SMLoc); + unsigned checkTargetMatchPredicate(MCInst &Inst) override; bool validateInstruction(MCInst &Inst, const OperandVector &Ops); @@ -955,6 +963,8 @@ private: public: enum X86MatchResultTy { Match_Unsupported = FIRST_TARGET_MATCH_RESULT_TY, +#define GET_OPERAND_DIAGNOSTIC_TYPES +#include "X86GenAsmMatcher.inc" }; X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser, @@ -3173,6 +3183,13 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, EmitInstruction(Inst, Operands, Out); Opcode = Inst.getOpcode(); return false; + case Match_InvalidImmUnsignedi4: { + SMLoc ErrorLoc = ((X86Operand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + return Error(ErrorLoc, "immediate must be an integer in range [0, 15]", + EmptyRange, MatchingInlineAsm); + } case Match_MissingFeature: return ErrorMissingFeature(IDLoc, MissingFeatures, MatchingInlineAsm); case Match_InvalidOperand: @@ -3520,6 +3537,15 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, MatchingInlineAsm); } + if (std::count(std::begin(Match), std::end(Match), + Match_InvalidImmUnsignedi4) == 1) { + SMLoc ErrorLoc = ((X86Operand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + return Error(ErrorLoc, "immediate must be an integer in range [0, 15]", + EmptyRange, MatchingInlineAsm); + } + // If all of these were an outright failure, report it in a useless way. 
return Error(IDLoc, "unknown instruction mnemonic", EmptyRange, MatchingInlineAsm); @@ -3572,6 +3598,16 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { return parseDirectiveFPOEndPrologue(DirectiveID.getLoc()); else if (IDVal == ".cv_fpo_endproc") return parseDirectiveFPOEndProc(DirectiveID.getLoc()); + else if (IDVal == ".seh_pushreg") + return parseDirectiveSEHPushReg(DirectiveID.getLoc()); + else if (IDVal == ".seh_setframe") + return parseDirectiveSEHSetFrame(DirectiveID.getLoc()); + else if (IDVal == ".seh_savereg") + return parseDirectiveSEHSaveReg(DirectiveID.getLoc()); + else if (IDVal == ".seh_savexmm") + return parseDirectiveSEHSaveXMM(DirectiveID.getLoc()); + else if (IDVal == ".seh_pushframe") + return parseDirectiveSEHPushFrame(DirectiveID.getLoc()); return true; } @@ -3708,6 +3744,140 @@ bool X86AsmParser::parseDirectiveFPOEndProc(SMLoc L) { return getTargetStreamer().emitFPOEndProc(L); } +bool X86AsmParser::parseSEHRegisterNumber(unsigned RegClassID, + unsigned &RegNo) { + SMLoc startLoc = getLexer().getLoc(); + const MCRegisterInfo *MRI = getContext().getRegisterInfo(); + + // Try parsing the argument as a register first. + if (getLexer().getTok().isNot(AsmToken::Integer)) { + SMLoc endLoc; + if (ParseRegister(RegNo, startLoc, endLoc)) + return true; + + if (!X86MCRegisterClasses[RegClassID].contains(RegNo)) { + return Error(startLoc, + "register is not supported for use with this directive"); + } + } else { + // Otherwise, an integer number matching the encoding of the desired + // register may appear. + int64_t EncodedReg; + if (getParser().parseAbsoluteExpression(EncodedReg)) + return true; + + // The SEH register number is the same as the encoding register number. Map + // from the encoding back to the LLVM register number. 
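// For example, ".seh_pushreg 3" supplies encoding 3, and scanning GR64 for a
// register with that encoding value recovers X86::RBX (x86-64 integer
// encodings: RAX=0, RCX=1, RDX=2, RBX=3, RSP=4, RBP=5, RSI=6, RDI=7, and
// R8-R15 = 8-15).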
+ RegNo = 0; + for (MCPhysReg Reg : X86MCRegisterClasses[RegClassID]) { + if (MRI->getEncodingValue(Reg) == EncodedReg) { + RegNo = Reg; + break; + } + } + if (RegNo == 0) { + return Error(startLoc, + "incorrect register number for use with this directive"); + } + } + + return false; +} + +bool X86AsmParser::parseDirectiveSEHPushReg(SMLoc Loc) { + unsigned Reg = 0; + if (parseSEHRegisterNumber(X86::GR64RegClassID, Reg)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + getParser().Lex(); + getStreamer().EmitWinCFIPushReg(Reg, Loc); + return false; +} + +bool X86AsmParser::parseDirectiveSEHSetFrame(SMLoc Loc) { + unsigned Reg = 0; + int64_t Off; + if (parseSEHRegisterNumber(X86::GR64RegClassID, Reg)) + return true; + if (getLexer().isNot(AsmToken::Comma)) + return TokError("you must specify a stack pointer offset"); + + getParser().Lex(); + if (getParser().parseAbsoluteExpression(Off)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + getParser().Lex(); + getStreamer().EmitWinCFISetFrame(Reg, Off, Loc); + return false; +} + +bool X86AsmParser::parseDirectiveSEHSaveReg(SMLoc Loc) { + unsigned Reg = 0; + int64_t Off; + if (parseSEHRegisterNumber(X86::GR64RegClassID, Reg)) + return true; + if (getLexer().isNot(AsmToken::Comma)) + return TokError("you must specify an offset on the stack"); + + getParser().Lex(); + if (getParser().parseAbsoluteExpression(Off)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + getParser().Lex(); + getStreamer().EmitWinCFISaveReg(Reg, Off, Loc); + return false; +} + +bool X86AsmParser::parseDirectiveSEHSaveXMM(SMLoc Loc) { + unsigned Reg = 0; + int64_t Off; + if (parseSEHRegisterNumber(X86::VR128XRegClassID, Reg)) + return true; + if (getLexer().isNot(AsmToken::Comma)) + return TokError("you must specify an offset on the stack"); + + getParser().Lex(); + if (getParser().parseAbsoluteExpression(Off)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + getParser().Lex(); + getStreamer().EmitWinCFISaveXMM(Reg, Off, Loc); + return false; +} + +bool X86AsmParser::parseDirectiveSEHPushFrame(SMLoc Loc) { + bool Code = false; + StringRef CodeID; + if (getLexer().is(AsmToken::At)) { + SMLoc startLoc = getLexer().getLoc(); + getParser().Lex(); + if (!getParser().parseIdentifier(CodeID)) { + if (CodeID != "code") + return Error(startLoc, "expected @code"); + Code = true; + } + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + getParser().Lex(); + getStreamer().EmitWinCFIPushFrame(Code, Loc); + return false; +} + // Force static initialization. 
extern "C" void LLVMInitializeX86AsmParser() { RegisterMCAsmParser<X86AsmParser> X(getTheX86_32Target()); diff --git a/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/lib/Target/X86/AsmParser/X86AsmParserCommon.h index 5bc979d1f18c..e9be28ca77b0 100644 --- a/lib/Target/X86/AsmParser/X86AsmParserCommon.h +++ b/lib/Target/X86/AsmParser/X86AsmParserCommon.h @@ -35,6 +35,10 @@ inline bool isImmUnsignedi8Value(uint64_t Value) { return isUInt<8>(Value) || isInt<8>(Value); } +inline bool isImmUnsignedi4Value(uint64_t Value) { + return isUInt<4>(Value); +} + } // End of namespace llvm #endif diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h index a771ba366318..3a76d023e640 100644 --- a/lib/Target/X86/AsmParser/X86Operand.h +++ b/lib/Target/X86/AsmParser/X86Operand.h @@ -260,6 +260,15 @@ struct X86Operand final : public MCParsedAsmOperand { return isImmSExti64i32Value(CE->getValue()); } + bool isImmUnsignedi4() const { + if (!isImm()) return false; + // If this isn't a constant expr, reject it. The immediate byte is shared + // with a register encoding. We can't have it affected by a relocation. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + return isImmUnsignedi4Value(CE->getValue()); + } + bool isImmUnsignedi8() const { if (!isImm()) return false; // If this isn't a constant expr, just assume it fits and let relaxation @@ -491,7 +500,7 @@ struct X86Operand final : public MCParsedAsmOperand { void addGR32orGR64Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - unsigned RegNo = getReg(); + MCRegister RegNo = getReg(); if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo)) RegNo = getX86SubSuperRegister(RegNo, 32); Inst.addOperand(MCOperand::createReg(RegNo)); @@ -572,7 +581,7 @@ struct X86Operand final : public MCParsedAsmOperand { static std::unique_ptr<X86Operand> CreateToken(StringRef Str, SMLoc Loc) { SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size()); - auto Res = llvm::make_unique<X86Operand>(Token, Loc, EndLoc); + auto Res = std::make_unique<X86Operand>(Token, Loc, EndLoc); Res->Tok.Data = Str.data(); Res->Tok.Length = Str.size(); return Res; @@ -582,7 +591,7 @@ struct X86Operand final : public MCParsedAsmOperand { CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc, bool AddressOf = false, SMLoc OffsetOfLoc = SMLoc(), StringRef SymName = StringRef(), void *OpDecl = nullptr) { - auto Res = llvm::make_unique<X86Operand>(Register, StartLoc, EndLoc); + auto Res = std::make_unique<X86Operand>(Register, StartLoc, EndLoc); Res->Reg.RegNo = RegNo; Res->AddressOf = AddressOf; Res->OffsetOfLoc = OffsetOfLoc; @@ -593,19 +602,19 @@ struct X86Operand final : public MCParsedAsmOperand { static std::unique_ptr<X86Operand> CreateDXReg(SMLoc StartLoc, SMLoc EndLoc) { - return llvm::make_unique<X86Operand>(DXRegister, StartLoc, EndLoc); + return std::make_unique<X86Operand>(DXRegister, StartLoc, EndLoc); } static std::unique_ptr<X86Operand> CreatePrefix(unsigned Prefixes, SMLoc StartLoc, SMLoc EndLoc) { - auto Res = llvm::make_unique<X86Operand>(Prefix, StartLoc, EndLoc); + auto Res = std::make_unique<X86Operand>(Prefix, StartLoc, EndLoc); Res->Pref.Prefixes = Prefixes; return Res; } static std::unique_ptr<X86Operand> CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc) { - auto Res = llvm::make_unique<X86Operand>(Immediate, StartLoc, EndLoc); + auto Res = std::make_unique<X86Operand>(Immediate, StartLoc, EndLoc); Res->Imm.Val = Val; return Res; } 
@@ -615,7 +624,7 @@ struct X86Operand final : public MCParsedAsmOperand { CreateMem(unsigned ModeSize, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, unsigned Size = 0, StringRef SymName = StringRef(), void *OpDecl = nullptr, unsigned FrontendSize = 0) { - auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc); + auto Res = std::make_unique<X86Operand>(Memory, StartLoc, EndLoc); Res->Mem.SegReg = 0; Res->Mem.Disp = Disp; Res->Mem.BaseReg = 0; @@ -643,7 +652,7 @@ struct X86Operand final : public MCParsedAsmOperand { // The scale should always be one of {1,2,4,8}. assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) && "Invalid scale!"); - auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc); + auto Res = std::make_unique<X86Operand>(Memory, StartLoc, EndLoc); Res->Mem.SegReg = SegReg; Res->Mem.Disp = Disp; Res->Mem.BaseReg = BaseReg; diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index a241362a271d..e287f6625115 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -12,13 +12,14 @@ // //===----------------------------------------------------------------------===// +#include "X86DisassemblerDecoder.h" +#include "llvm/ADT/StringRef.h" + #include <cstdarg> /* for va_*() */ #include <cstdio> /* for vsnprintf() */ #include <cstdlib> /* for exit() */ #include <cstring> /* for memset() */ -#include "X86DisassemblerDecoder.h" - using namespace llvm::X86Disassembler; /// Specifies whether a ModR/M byte is needed and (if so) which diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 54413fa1a02f..f08fcb575bf0 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -287,7 +287,7 @@ bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const { // Relax if the value is too big for a (signed) i8. - return int64_t(Value) != int64_t(int8_t(Value)); + return !isInt<8>(Value); } // FIXME: Can tblgen help at all here to verify there aren't other instructions @@ -557,7 +557,7 @@ protected: // If the frame pointer is other than esp/rsp, we do not have a way to // generate a compact unwinding representation, so bail out. - if (MRI.getLLVMRegNum(Inst.getRegister(), true) != + if (*MRI.getLLVMRegNum(Inst.getRegister(), true) != (Is64Bit ? X86::RBP : X86::EBP)) return 0; @@ -605,7 +605,7 @@ protected: // unwind encoding. 
return CU::UNWIND_MODE_DWARF; - unsigned Reg = MRI.getLLVMRegNum(Inst.getRegister(), true); + unsigned Reg = *MRI.getLLVMRegNum(Inst.getRegister(), true); SavedRegs[SavedRegIdx++] = Reg; StackAdjust += OffsetSize; InstrOffset += PushInstrSize(Reg); diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index 232a06593238..bd009da60851 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -46,10 +46,10 @@ X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, enum X86_64RelType { RT64_NONE, RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 }; -static X86_64RelType getType64(unsigned Kind, +static X86_64RelType getType64(MCFixupKind Kind, MCSymbolRefExpr::VariantKind &Modifier, bool &IsPCRel) { - switch (Kind) { + switch (unsigned(Kind)) { default: llvm_unreachable("Unimplemented"); case FK_NONE: @@ -97,7 +97,7 @@ static void checkIs32(MCContext &Ctx, SMLoc Loc, X86_64RelType Type) { static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, MCSymbolRefExpr::VariantKind Modifier, X86_64RelType Type, bool IsPCRel, - unsigned Kind) { + MCFixupKind Kind) { switch (Modifier) { default: llvm_unreachable("Unimplemented"); @@ -202,7 +202,7 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, // and we want to keep back-compatibility. if (!Ctx.getAsmInfo()->canRelaxRelocations()) return ELF::R_X86_64_GOTPCREL; - switch (Kind) { + switch (unsigned(Kind)) { default: return ELF::R_X86_64_GOTPCREL; case X86::reloc_riprel_4byte_relax: @@ -237,7 +237,7 @@ static X86_32RelType getType32(X86_64RelType T) { static unsigned getRelocType32(MCContext &Ctx, MCSymbolRefExpr::VariantKind Modifier, X86_32RelType Type, bool IsPCRel, - unsigned Kind) { + MCFixupKind Kind) { switch (Modifier) { default: llvm_unreachable("Unimplemented"); @@ -265,8 +265,9 @@ static unsigned getRelocType32(MCContext &Ctx, if (!Ctx.getAsmInfo()->canRelaxRelocations()) return ELF::R_386_GOT32; - return Kind == X86::reloc_signed_4byte_relax ? ELF::R_386_GOT32X - : ELF::R_386_GOT32; + return Kind == MCFixupKind(X86::reloc_signed_4byte_relax) + ? 
ELF::R_386_GOT32X + : ELF::R_386_GOT32; case MCSymbolRefExpr::VK_GOTOFF: assert(Type == RT32_32); assert(!IsPCRel); @@ -317,7 +318,7 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); - unsigned Kind = Fixup.getKind(); + MCFixupKind Kind = Fixup.getKind(); X86_64RelType Type = getType64(Kind, Modifier, IsPCRel); if (getEMachine() == ELF::EM_X86_64) return getRelocType64(Ctx, Fixup.getLoc(), Modifier, Type, IsPCRel, Kind); @@ -329,5 +330,5 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, std::unique_ptr<MCObjectTargetWriter> llvm::createX86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine) { - return llvm::make_unique<X86ELFObjectWriter>(IsELF64, OSABI, EMachine); + return std::make_unique<X86ELFObjectWriter>(IsELF64, OSABI, EMachine); } diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index e1125c176b25..d986c829d98e 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -163,5 +163,7 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { TextAlignFillValue = 0x90; + AllowAtInName = true; + UseIntegratedAssembler = true; } diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 31d26d08a63f..ac36bf3a12fa 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -862,6 +862,9 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, VEX_B = ~(BaseRegEnc >> 3) & 1; unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg); VEX_X = ~(IndexRegEnc >> 3) & 1; + if (!HasVEX_4V) // Only needed with VSIB which don't use VVVV. + EVEX_V2 = ~(IndexRegEnc >> 4) & 1; + break; } case X86II::MRMSrcReg: { diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index ce05ad974507..ced9eacc8b97 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -70,6 +70,10 @@ unsigned X86_MC::getDwarfRegFlavour(const Triple &TT, bool isEH) { return DWARFFlavour::X86_32_Generic; } +bool X86_MC::hasLockPrefix(const MCInst &MI) { + return MI.getFlags() & X86::IP_HAS_LOCK; +} + void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) { // FIXME: TableGen these. 
for (unsigned Reg = X86::NoRegister + 1; Reg < X86::NUM_TARGET_REGS; ++Reg) { @@ -399,6 +403,9 @@ public: findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents, uint64_t GotSectionVA, const Triple &TargetTriple) const override; + Optional<uint64_t> evaluateMemoryOperandAddress(const MCInst &Inst, + uint64_t Addr, + uint64_t Size) const override; }; #define GET_STIPREDICATE_DEFS_FOR_MC_ANALYSIS @@ -511,7 +518,31 @@ std::vector<std::pair<uint64_t, uint64_t>> X86MCInstrAnalysis::findPltEntries( return findX86_64PltEntries(PltSectionVA, PltContents); default: return {}; - } + } +} + +Optional<uint64_t> X86MCInstrAnalysis::evaluateMemoryOperandAddress( + const MCInst &Inst, uint64_t Addr, uint64_t Size) const { + const MCInstrDesc &MCID = Info->get(Inst.getOpcode()); + int MemOpStart = X86II::getMemoryOperandNo(MCID.TSFlags); + if (MemOpStart == -1) + return None; + MemOpStart += X86II::getOperandBias(MCID); + + const MCOperand &SegReg = Inst.getOperand(MemOpStart + X86::AddrSegmentReg); + const MCOperand &BaseReg = Inst.getOperand(MemOpStart + X86::AddrBaseReg); + const MCOperand &IndexReg = Inst.getOperand(MemOpStart + X86::AddrIndexReg); + const MCOperand &ScaleAmt = Inst.getOperand(MemOpStart + X86::AddrScaleAmt); + const MCOperand &Disp = Inst.getOperand(MemOpStart + X86::AddrDisp); + if (SegReg.getReg() != 0 || IndexReg.getReg() != 0 || ScaleAmt.getImm() != 1 || + !Disp.isImm()) + return None; + + // RIP-relative addressing. + if (BaseReg.getReg() == X86::RIP) + return Addr + Size + Disp.getImm(); + + return None; } } // end of namespace X86_MC @@ -567,13 +598,13 @@ extern "C" void LLVMInitializeX86TargetMC() { createX86_64AsmBackend); } -unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, - bool High) { +MCRegister llvm::getX86SubSuperRegisterOrZero(MCRegister Reg, unsigned Size, + bool High) { switch (Size) { - default: return 0; + default: return X86::NoRegister; case 8: if (High) { - switch (Reg) { + switch (Reg.id()) { default: return getX86SubSuperRegisterOrZero(Reg, 64); case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: return X86::SI; @@ -593,8 +624,8 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, return X86::BH; } } else { - switch (Reg) { - default: return 0; + switch (Reg.id()) { + default: return X86::NoRegister; case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::AL; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: @@ -630,8 +661,8 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, } } case 16: - switch (Reg) { - default: return 0; + switch (Reg.id()) { + default: return X86::NoRegister; case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::AX; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: @@ -666,8 +697,8 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, return X86::R15W; } case 32: - switch (Reg) { - default: return 0; + switch (Reg.id()) { + default: return X86::NoRegister; case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::EAX; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: @@ -702,7 +733,7 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, return X86::R15D; } case 64: - switch (Reg) { + switch (Reg.id()) { default: return 0; case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::RAX; @@ -740,9 +771,9 @@ unsigned 
llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, } } -unsigned llvm::getX86SubSuperRegister(unsigned Reg, unsigned Size, bool High) { - unsigned Res = getX86SubSuperRegisterOrZero(Reg, Size, High); - assert(Res != 0 && "Unexpected register or VT"); +MCRegister llvm::getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High) { + MCRegister Res = getX86SubSuperRegisterOrZero(Reg, Size, High); + assert(Res != X86::NoRegister && "Unexpected register or VT"); return Res; } diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 00dd5908cbf5..0c789061f0e1 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H #define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H +#include "llvm/MC/MCRegister.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Support/DataTypes.h" #include <string> @@ -57,6 +58,10 @@ unsigned getDwarfRegFlavour(const Triple &TT, bool isEH); void initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI); + +/// Returns true if this instruction has a LOCK prefix. +bool hasLockPrefix(const MCInst &MI); + /// Create a X86 MCSubtargetInfo instance. This is exposed so Asm parser, etc. /// do not need to go through TargetRegistry. MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU, @@ -111,12 +116,12 @@ createX86WinCOFFObjectWriter(bool Is64Bit); /// Returns the sub or super register of a specific X86 register. /// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX. /// Aborts on error. -unsigned getX86SubSuperRegister(unsigned, unsigned, bool High=false); +MCRegister getX86SubSuperRegister(MCRegister, unsigned, bool High=false); /// Returns the sub or super register of a specific X86 register. /// Like getX86SubSuperRegister() but returns 0 on error. -unsigned getX86SubSuperRegisterOrZero(unsigned, unsigned, - bool High = false); +MCRegister getX86SubSuperRegisterOrZero(MCRegister, unsigned, + bool High = false); } // End llvm namespace diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index fc7e99f61e5e..b67a7508fe72 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -276,7 +276,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( // x86_64 distinguishes movq foo@GOTPCREL so that the linker can // rewrite the movq to an leaq at link time if the symbol ends up in // the same linkage unit. 
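The evaluateMemoryOperandAddress hook added above handles only the RIP-relative case, where the displacement is measured from the end of the instruction, hence the Addr + Size + Disp sum. A self-contained sketch with made-up numbers:

#include <cassert>
#include <cstdint>
#include <cstdio>

// RIP-relative displacements are relative to the next instruction, so the
// effective address is instruction address + instruction size + displacement.
static uint64_t ripRelativeTarget(uint64_t Addr, uint64_t Size, int64_t Disp) {
  return Addr + Size + Disp;
}

int main() {
  // Hypothetical 7-byte "mov 0x2f00(%rip), %rax" placed at 0x401000.
  uint64_t Target = ripRelativeTarget(0x401000, 7, 0x2f00);
  assert(Target == 0x403f07);
  std::printf("target = %#llx\n", (unsigned long long)Target);
}

Any segment override, index register, non-unit scale, or non-immediate displacement makes the hook bail out with None instead.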
- if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load) + if (Fixup.getTargetKind() == X86::reloc_riprel_4byte_movq_load) Type = MachO::X86_64_RELOC_GOT_LOAD; else Type = MachO::X86_64_RELOC_GOT; @@ -339,8 +339,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( return; } else { Type = MachO::X86_64_RELOC_UNSIGNED; - unsigned Kind = Fixup.getKind(); - if (Kind == X86::reloc_signed_4byte) { + if (Fixup.getTargetKind() == X86::reloc_signed_4byte) { Asm.getContext().reportError( Fixup.getLoc(), "32-bit absolute addressing is not supported in 64-bit mode"); @@ -600,5 +599,5 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, std::unique_ptr<MCObjectTargetWriter> llvm::createX86MachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype) { - return llvm::make_unique<X86MachObjectWriter>(Is64Bit, CPUType, CPUSubtype); + return std::make_unique<X86MachObjectWriter>(Is64Bit, CPUType, CPUSubtype); } diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 3baab9da1c41..760239f76505 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -109,5 +109,5 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx, std::unique_ptr<MCObjectTargetWriter> llvm::createX86WinCOFFObjectWriter(bool Is64Bit) { - return llvm::make_unique<X86WinCOFFObjectWriter>(Is64Bit); + return std::make_unique<X86WinCOFFObjectWriter>(Is64Bit); } diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index 796a27a17255..db624378d517 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -35,8 +35,9 @@ void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) { MCStreamer::EmitWinEHHandlerData(Loc); // We have to emit the unwind info now, because this directive - // actually switches to the .xdata section! - EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo()); + // actually switches to the .xdata section. + if (WinEH::FrameInfo *CurFrame = getCurrentWinFrameInfo()) + EHStreamer.EmitUnwindInfo(*this, CurFrame); } void X86WinCOFFStreamer::EmitWindowsUnwindTables() { diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp index e9987d1f62bd..d5494ef12370 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp @@ -170,7 +170,7 @@ bool X86WinCOFFTargetStreamer::emitFPOProc(const MCSymbol *ProcSym, L, "opening new .cv_fpo_proc before closing previous frame"); return true; } - CurFPOData = llvm::make_unique<FPOData>(); + CurFPOData = std::make_unique<FPOData>(); CurFPOData->Function = ProcSym; CurFPOData->Begin = emitFPOLabel(); CurFPOData->ParamsSize = ParamsSize; diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index a95f68434d12..6840fc12751d 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -81,6 +81,12 @@ FunctionPass *createX86FlagsCopyLoweringPass(); /// Return a pass that expands WinAlloca pseudo-instructions. FunctionPass *createX86WinAllocaExpander(); +/// Return a pass that inserts int3 at the end of the function if it ends with a +/// CALL instruction. The pass does the same for each funclet as well. 
This +/// ensures that the open interval of function start and end PCs contains all +/// return addresses for the benefit of the Windows x64 unwinder. +FunctionPass *createX86AvoidTrailingCallPass(); + /// Return a pass that optimizes the code-size of x86 call sequences. This is /// done by replacing esp-relative movs with pushes. FunctionPass *createX86CallFrameOptimization(); @@ -137,13 +143,13 @@ void initializeWinEHStatePassPass(PassRegistry &); void initializeX86AvoidSFBPassPass(PassRegistry &); void initializeX86CallFrameOptimizationPass(PassRegistry &); void initializeX86CmovConverterPassPass(PassRegistry &); -void initializeX86ExpandPseudoPass(PassRegistry&); void initializeX86CondBrFoldingPassPass(PassRegistry &); void initializeX86DomainReassignmentPass(PassRegistry &); void initializeX86ExecutionDomainFixPass(PassRegistry &); +void initializeX86ExpandPseudoPass(PassRegistry &); void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); +void initializeX86OptimizeLEAPassPass(PassRegistry &); void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &); - } // End llvm namespace #endif diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 3112f00c91f2..d8631aca2734 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -95,7 +95,8 @@ def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA", def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true", "Support 64-bit instructions">; def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true", - "64-bit with cmpxchg16b">; + "64-bit with cmpxchg16b", + [FeatureCMPXCHG8B]>; def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", "SHLD instruction is slow">; def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true", @@ -240,8 +241,11 @@ def FeatureCLDEMOTE : SubtargetFeature<"cldemote", "HasCLDEMOTE", "true", "Enable Cache Demote">; def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true", "Support ptwrite instruction">; -def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true", - "Support MPX instructions">; +// FIXME: This feature is deprecated in 10.0 and should not be used for +// anything, but removing it would break IR files that may contain it in a +// target-feature attribute. +def FeatureDeprecatedMPX : SubtargetFeature<"mpx", "DeprecatedHasMPX", "false", + "Deprecated. Support MPX instructions">; def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", "Use LEA for adjusting the stack pointer">; def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", @@ -374,6 +378,10 @@ def FeatureHasFastGather : SubtargetFeature<"fast-gather", "HasFastGather", "true", "Indicates if gather is reasonably fast">; +def FeaturePrefer128Bit + : SubtargetFeature<"prefer-128-bit", "Prefer128Bit", "true", + "Prefer 128-bit AVX instructions">; + def FeaturePrefer256Bit : SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true", "Prefer 256-bit AVX instructions">; @@ -449,6 +457,10 @@ def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch", "Merge branches to a three-way " "conditional branch">; +// Enable use of alias analysis during code generation. 
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true", + "Use alias analysis during codegen">; + // Bonnell def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">; // Silvermont @@ -579,7 +591,6 @@ def ProcessorFeatures { // Skylake list<SubtargetFeature> SKLAdditionalFeatures = [FeatureAES, - FeatureMPX, FeatureXSAVEC, FeatureXSAVES, FeatureCLFLUSHOPT, @@ -594,6 +605,7 @@ def ProcessorFeatures { // Skylake-AVX512 list<SubtargetFeature> SKXAdditionalFeatures = [FeatureAVX512, + FeaturePrefer256Bit, FeatureCDI, FeatureDQI, FeatureBWI, @@ -627,6 +639,7 @@ def ProcessorFeatures { // Cannonlake list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512, + FeaturePrefer256Bit, FeatureCDI, FeatureDQI, FeatureBWI, @@ -665,6 +678,17 @@ def ProcessorFeatures { list<SubtargetFeature> ICXFeatures = !listconcat(ICLInheritableFeatures, ICXSpecificFeatures); + //Tigerlake + list<SubtargetFeature> TGLAdditionalFeatures = [FeatureVP2INTERSECT, + FeatureMOVDIRI, + FeatureMOVDIR64B, + FeatureSHSTK]; + list<SubtargetFeature> TGLSpecificFeatures = [FeatureHasFastGather]; + list<SubtargetFeature> TGLInheritableFeatures = + !listconcat(TGLAdditionalFeatures ,TGLSpecificFeatures); + list<SubtargetFeature> TGLFeatures = + !listconcat(ICLFeatures, TGLInheritableFeatures ); + // Atom list<SubtargetFeature> AtomInheritableFeatures = [FeatureX87, FeatureCMPXCHG8B, @@ -707,7 +731,6 @@ def ProcessorFeatures { // Goldmont list<SubtargetFeature> GLMAdditionalFeatures = [FeatureAES, - FeatureMPX, FeatureSHA, FeatureRDSEED, FeatureXSAVE, @@ -786,6 +809,22 @@ def ProcessorFeatures { list<SubtargetFeature> KNMFeatures = !listconcat(KNLFeatures, [FeatureVPOPCNTDQ]); + // Barcelona + list<SubtargetFeature> BarcelonaInheritableFeatures = [FeatureX87, + FeatureCMPXCHG8B, + FeatureSSE4A, + Feature3DNowA, + FeatureFXSR, + FeatureNOPL, + FeatureCMPXCHG16B, + FeatureLZCNT, + FeaturePOPCNT, + FeatureSlowSHLD, + FeatureLAHFSAHF, + FeatureCMOV, + Feature64Bit, + FeatureFastScalarShiftMasks]; + list<SubtargetFeature> BarcelonaFeatures = BarcelonaInheritableFeatures; // Bobcat list<SubtargetFeature> BtVer1InheritableFeatures = [FeatureX87, @@ -1093,6 +1132,8 @@ def : ProcessorModel<"icelake-client", SkylakeServerModel, ProcessorFeatures.ICLFeatures>; def : ProcessorModel<"icelake-server", SkylakeServerModel, ProcessorFeatures.ICXFeatures>; +def : ProcessorModel<"tigerlake", SkylakeServerModel, + ProcessorFeatures.TGLFeatures>; // AMD CPUs. @@ -1129,10 +1170,7 @@ foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in { } foreach P = ["amdfam10", "barcelona"] in { - def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE4A, Feature3DNowA, - FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureLZCNT, - FeaturePOPCNT, FeatureSlowSHLD, FeatureLAHFSAHF, FeatureCMOV, - Feature64Bit, FeatureFastScalarShiftMasks]>; + def : Proc<P, ProcessorFeatures.BarcelonaFeatures>; } // Bobcat diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index 80120722e0e6..8d27be30a277 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -242,7 +242,7 @@ void X86AsmPrinter::PrintModifiedOperand(const MachineInstr *MI, unsigned OpNo, return PrintOperand(MI, OpNo, O); if (MI->getInlineAsmDialect() == InlineAsm::AD_ATT) O << '%'; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (strncmp(Modifier, "subreg", strlen("subreg")) == 0) { unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 : (strcmp(Modifier+6,"32") == 0) ? 
32 : @@ -388,7 +388,7 @@ void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI, static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, char Mode, raw_ostream &O) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); bool EmitPercent = true; if (!X86::GR8RegClass.contains(Reg) && @@ -575,7 +575,7 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { // Emitting note header. int WordSize = TT.isArch64Bit() ? 8 : 4; - EmitAlignment(WordSize == 4 ? 2 : 3); + EmitAlignment(WordSize == 4 ? Align(4) : Align(8)); OutStreamer->EmitIntValue(4, 4 /*size*/); // data size for "GNU\0" OutStreamer->EmitIntValue(8 + WordSize, 4 /*size*/); // Elf_Prop size OutStreamer->EmitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4 /*size*/); @@ -585,7 +585,7 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { OutStreamer->EmitIntValue(ELF::GNU_PROPERTY_X86_FEATURE_1_AND, 4); OutStreamer->EmitIntValue(4, 4); // data size OutStreamer->EmitIntValue(FeatureFlagsAnd, 4); // data - EmitAlignment(WordSize == 4 ? 2 : 3); // padding + EmitAlignment(WordSize == 4 ? Align(4) : Align(8)); // padding OutStreamer->endSection(Nt); OutStreamer->SwitchSection(Cur); diff --git a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp index 3dcc1015dc7c..69c6b3356cbb 100644 --- a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp +++ b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp @@ -35,6 +35,7 @@ #include "X86InstrInfo.h" #include "X86Subtarget.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -390,7 +391,7 @@ void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, MachineMemOperand *LMMO = *LoadInst->memoperands_begin(); MachineMemOperand *SMMO = *StoreInst->memoperands_begin(); - unsigned Reg1 = MRI->createVirtualRegister( + Register Reg1 = MRI->createVirtualRegister( TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent()))); MachineInstr *NewLoad = BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode), diff --git a/lib/Target/X86/X86AvoidTrailingCall.cpp b/lib/Target/X86/X86AvoidTrailingCall.cpp new file mode 100644 index 000000000000..fb4f9e2901dc --- /dev/null +++ b/lib/Target/X86/X86AvoidTrailingCall.cpp @@ -0,0 +1,108 @@ +//===----- X86AvoidTrailingCall.cpp - Insert int3 after trailing calls ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The Windows x64 unwinder has trouble unwinding the stack when a return +// address points to the end of the function. This pass maintains the invariant +// that every return address is inside the bounds of its parent function or +// funclet by inserting int3 if the last instruction would otherwise be a call. 
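The rationale above can be pictured with concrete numbers: when a call is the last instruction, its return address equals the function's end PC and therefore falls outside the PC range the unwinder searches, and a single padding byte after the call restores the invariant. A toy model, with all addresses hypothetical:

#include <cassert>
#include <cstdint>

// Mirrors the invariant described above: a return address must fall strictly
// inside its function's start and end PCs to be attributed correctly.
static bool unwindable(uint64_t Begin, uint64_t End, uint64_t RetAddr) {
  return Begin < RetAddr && RetAddr < End;
}

int main() {
  const uint64_t Begin = 0x1000;
  uint64_t End = 0x1020;     // function ends right after a trailing call
  uint64_t RetAddr = 0x1020; // return address pushed by that call

  assert(!unwindable(Begin, End, RetAddr)); // frame is misattributed

  End += 1; // one int3 byte inserted after the call
  assert(unwindable(Begin, End, RetAddr)); // return address is now inside
}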
+// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" + +#define DEBUG_TYPE "x86-avoid-trailing-call" + +using namespace llvm; + +namespace { + +class X86AvoidTrailingCallPass : public MachineFunctionPass { +public: + X86AvoidTrailingCallPass() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + StringRef getPassName() const override { + return "X86 avoid trailing call pass"; + } + static char ID; +}; + +char X86AvoidTrailingCallPass::ID = 0; + +} // end anonymous namespace + +FunctionPass *llvm::createX86AvoidTrailingCallPass() { + return new X86AvoidTrailingCallPass(); +} + +// A real instruction is a non-meta, non-pseudo instruction. Some pseudos +// expand to nothing, and some expand to code. This logic conservatively assumes +// they might expand to nothing. +static bool isRealInstruction(MachineInstr &MI) { + return !MI.isPseudo() && !MI.isMetaInstruction(); +} + +// Return true if this is a call instruction, but not a tail call. +static bool isCallInstruction(const MachineInstr &MI) { + return MI.isCall() && !MI.isReturn(); +} + +bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) { + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const X86InstrInfo &TII = *STI.getInstrInfo(); + assert(STI.isTargetWin64() && "pass only runs on Win64"); + + // FIXME: Perhaps this pass should also replace SEH_Epilogue by inserting nops + // before epilogues. + + bool Changed = false; + for (MachineBasicBlock &MBB : MF) { + // Look for basic blocks that precede funclet entries or are at the end of + // the function. + MachineBasicBlock *NextMBB = MBB.getNextNode(); + if (NextMBB && !NextMBB->isEHFuncletEntry()) + continue; + + // Find the last real instruction in this block, or previous blocks if this + // block is empty. + MachineBasicBlock::reverse_iterator LastRealInstr; + for (MachineBasicBlock &RMBB : + make_range(MBB.getReverseIterator(), MF.rend())) { + LastRealInstr = llvm::find_if(reverse(RMBB), isRealInstruction); + if (LastRealInstr != RMBB.rend()) + break; + } + + // Do nothing if this function or funclet has no instructions. + if (LastRealInstr == MF.begin()->rend()) + continue; + + // If this is a call instruction, insert int3 right after it with the same + // DebugLoc. Convert back to a forward iterator and advance the insertion + // position once. + if (isCallInstruction(*LastRealInstr)) { + LLVM_DEBUG({ + dbgs() << "inserting int3 after trailing call instruction:\n"; + LastRealInstr->dump(); + dbgs() << '\n'; + }); + + MachineBasicBlock::iterator MBBI = std::next(LastRealInstr.getReverse()); + BuildMI(*LastRealInstr->getParent(), MBBI, LastRealInstr->getDebugLoc(), + TII.get(X86::INT3)); + Changed = true; + } + } + + return Changed; +} diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp index 4df849a2e14c..ad7e32b4efc8 100644 --- a/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/lib/Target/X86/X86CallFrameOptimization.cpp @@ -155,12 +155,22 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) { // This is bad, and breaks SP adjustment. // So, check that all of the frames in the function are closed inside // the same block, and, for good measure, that there are no nested frames. 
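The guard added to the legality check just below keeps this transformation away from large frames: converting esp-relative stores into pushes would replace one probed allocation with many small unprobed ones. A compact restatement of the predicate, using the 4096-byte default probe size noted later in this diff; the helper name is illustrative, not the pass's real interface:

#include <cassert>

// Pushes grow the stack without touching the guard page the way a probed
// allocation does, so conversion is only legal when the frame stays below
// the stack probe size (or when the target does not use stack probes).
static bool framePermitsPushConversion(unsigned FrameSize,
                                       unsigned StackProbeSize,
                                       bool UseStackProbe) {
  return !(UseStackProbe && FrameSize >= StackProbeSize);
}

int main() {
  assert(framePermitsPushConversion(128, 4096, true));   // small frame: OK
  assert(!framePermitsPushConversion(8192, 4096, true)); // would need probes
  assert(framePermitsPushConversion(8192, 4096, false)); // no probing used
}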
+ // + // If any call allocates more argument stack memory than the stack + // probe size, don't do this optimization. Otherwise, this pass + // would need to synthesize additional stack probe calls to allocate + // memory for arguments. unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode(); unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); + bool UseStackProbe = + !STI->getTargetLowering()->getStackProbeSymbolName(MF).empty(); + unsigned StackProbeSize = STI->getTargetLowering()->getStackProbeSize(MF); for (MachineBasicBlock &BB : MF) { bool InsideFrameSequence = false; for (MachineInstr &MI : BB) { if (MI.getOpcode() == FrameSetupOpcode) { + if (TII->getFrameSize(MI) >= StackProbeSize && UseStackProbe) + return false; if (InsideFrameSequence) return false; InsideFrameSequence = true; @@ -325,8 +335,8 @@ X86CallFrameOptimization::classifyInstruction( for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; - unsigned int Reg = MO.getReg(); - if (!RegInfo.isPhysicalRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isPhysicalRegister(Reg)) continue; if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister())) return Exit; @@ -370,7 +380,7 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, while (I->getOpcode() == X86::LEA32r || I->isDebugInstr()) ++I; - unsigned StackPtr = RegInfo.getStackRegister(); + Register StackPtr = RegInfo.getStackRegister(); auto StackPtrCopyInst = MBB.end(); // SelectionDAG (but not FastISel) inserts a copy of ESP into a virtual // register. If it's there, use that virtual register as stack pointer @@ -443,8 +453,8 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, for (const MachineOperand &MO : I->uses()) { if (!MO.isReg()) continue; - unsigned int Reg = MO.getReg(); - if (RegInfo.isPhysicalRegister(Reg)) + Register Reg = MO.getReg(); + if (Register::isPhysicalRegister(Reg)) UsedRegs.insert(Reg); } } @@ -524,12 +534,12 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, break; case X86::MOV32mr: case X86::MOV64mr: { - unsigned int Reg = PushOp.getReg(); + Register Reg = PushOp.getReg(); // If storing a 32-bit vreg on 64-bit targets, extend to a 64-bit vreg // in preparation for the PUSH64. The upper 32 bits can be undef. if (Is64Bit && Store->getOpcode() == X86::MOV32mr) { - unsigned UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass); + Register UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass); Reg = MRI->createVirtualRegister(&X86::GR64RegClass); BuildMI(MBB, Context.Call, DL, TII->get(X86::IMPLICIT_DEF), UndefReg); BuildMI(MBB, Context.Call, DL, TII->get(X86::INSERT_SUBREG), Reg) @@ -598,7 +608,7 @@ MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush( // movl %eax, (%esp) // call // Get rid of those with prejudice. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return nullptr; // Make sure this is the only use of Reg. 
diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp index b16b3839c85a..7ee637cfd523 100644 --- a/lib/Target/X86/X86CallLowering.cpp +++ b/lib/Target/X86/X86CallLowering.cpp @@ -102,6 +102,8 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { DL(MIRBuilder.getMF().getDataLayout()), STI(MIRBuilder.getMF().getSubtarget<X86Subtarget>()) {} + bool isIncomingArgumentHandler() const override { return false; } + Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { LLT p0 = LLT::pointer(0, DL.getPointerSizeInBits(0)); @@ -155,8 +157,9 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - const CallLowering::ArgInfo &Info, CCState &State) override { - bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State); + const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, + CCState &State) override { + bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); StackSize = State.getNextStackOffset(); static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2, @@ -229,7 +232,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { : ValueHandler(MIRBuilder, MRI, AssignFn), DL(MIRBuilder.getMF().getDataLayout()) {} - bool isArgumentHandler() const override { return true; } + bool isIncomingArgumentHandler() const override { return true; } Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { @@ -237,7 +240,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { int FI = MFI.CreateFixedObject(Size, Offset, true); MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); - unsigned AddrReg = MRI.createGenericVirtualRegister( + Register AddrReg = MRI.createGenericVirtualRegister( LLT::pointer(0, DL.getPointerSizeInBits(0))); MIRBuilder.buildFrameIndex(AddrReg, FI); return AddrReg; @@ -301,6 +304,7 @@ struct FormalArgHandler : public IncomingValueHandler { : IncomingValueHandler(MIRBuilder, MRI, AssignFn) {} void markPhysRegUsed(unsigned PhysReg) override { + MIRBuilder.getMRI()->addLiveIn(PhysReg); MIRBuilder.getMBB().addLiveIn(PhysReg); } }; @@ -372,10 +376,7 @@ bool X86CallLowering::lowerFormalArguments( } bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, - CallingConv::ID CallConv, - const MachineOperand &Callee, - const ArgInfo &OrigRet, - ArrayRef<ArgInfo> OrigArgs) const { + CallLoweringInfo &Info) const { MachineFunction &MF = MIRBuilder.getMF(); const Function &F = MF.getFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -385,8 +386,8 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, auto TRI = STI.getRegisterInfo(); // Handle only Linux C, X86_64_SysV calling conventions for now. - if (!STI.isTargetLinux() || - !(CallConv == CallingConv::C || CallConv == CallingConv::X86_64_SysV)) + if (!STI.isTargetLinux() || !(Info.CallConv == CallingConv::C || + Info.CallConv == CallingConv::X86_64_SysV)) return false; unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); @@ -395,18 +396,19 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Create a temporarily-floating call instruction so we can add the implicit // uses of arg registers. bool Is64Bit = STI.is64Bit(); - unsigned CallOpc = Callee.isReg() + unsigned CallOpc = Info.Callee.isReg() ? (Is64Bit ? X86::CALL64r : X86::CALL32r) : (Is64Bit ? 
X86::CALL64pcrel32 : X86::CALLpcrel32); - auto MIB = MIRBuilder.buildInstrNoInsert(CallOpc).add(Callee).addRegMask( - TRI->getCallPreservedMask(MF, CallConv)); + auto MIB = MIRBuilder.buildInstrNoInsert(CallOpc) + .add(Info.Callee) + .addRegMask(TRI->getCallPreservedMask(MF, Info.CallConv)); SmallVector<ArgInfo, 8> SplitArgs; - for (const auto &OrigArg : OrigArgs) { + for (const auto &OrigArg : Info.OrigArgs) { // TODO: handle not simple cases. - if (OrigArg.Flags.isByVal()) + if (OrigArg.Flags[0].isByVal()) return false; if (OrigArg.Regs.size() > 1) @@ -423,8 +425,8 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) return false; - bool IsFixed = OrigArgs.empty() ? true : OrigArgs.back().IsFixed; - if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(CallConv)) { + bool IsFixed = Info.OrigArgs.empty() ? true : Info.OrigArgs.back().IsFixed; + if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(Info.CallConv)) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs // (prototype-less calls or calls to functions containing ellipsis (...) in @@ -445,23 +447,24 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // If Callee is a reg, since it is used by a target specific // instruction, it must have a register class matching the // constraint of that instruction. - if (Callee.isReg()) + if (Info.Callee.isReg()) MIB->getOperand(0).setReg(constrainOperandRegClass( MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(), - *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Callee, 0)); + *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee, + 0)); // Finally we can copy the returned value back into its virtual-register. In // symmetry with the arguments, the physical register must be an // implicit-define of the call instruction. - if (!OrigRet.Ty->isVoidTy()) { - if (OrigRet.Regs.size() > 1) + if (!Info.OrigRet.Ty->isVoidTy()) { + if (Info.OrigRet.Regs.size() > 1) return false; SplitArgs.clear(); SmallVector<Register, 8> NewRegs; - if (!splitToValueTypes(OrigRet, SplitArgs, DL, MRI, + if (!splitToValueTypes(Info.OrigRet, SplitArgs, DL, MRI, [&](ArrayRef<Register> Regs) { NewRegs.assign(Regs.begin(), Regs.end()); })) @@ -472,7 +475,7 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, return false; if (!NewRegs.empty()) - MIRBuilder.buildMerge(OrigRet.Regs[0], NewRegs); + MIRBuilder.buildMerge(Info.OrigRet.Regs[0], NewRegs); } CallSeqStart.addImm(Handler.getStackSize()) diff --git a/lib/Target/X86/X86CallLowering.h b/lib/Target/X86/X86CallLowering.h index 0445331bc3ff..444a0c7d0122 100644 --- a/lib/Target/X86/X86CallLowering.h +++ b/lib/Target/X86/X86CallLowering.h @@ -34,9 +34,8 @@ public: bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<ArrayRef<Register>> VRegs) const override; - bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, - const MachineOperand &Callee, const ArgInfo &OrigRet, - ArrayRef<ArgInfo> OrigArgs) const override; + bool lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const override; private: /// A function of this type is used to perform value split action. 
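The ABI comment quoted in the lowerCall hunk above refers to the System V x86-64 rule that a variadic callee receives in %al an upper bound on the number of vector registers used for the call. A sketch of what triggers it; compiling this for x86-64 Linux typically shows movb $1, %al immediately before the call, since exactly one XMM register carries an argument:

#include <cstdio>

int main() {
  double x = 2.5;          // passed in %xmm0
  std::printf("%f\n", x);  // caller sets %al = 1 before this variadic call
  return 0;
}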
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 1c3034a5116a..4c49d68bec99 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -433,6 +433,7 @@ defm X86_SysV64_RegCall : def RetCC_X86_32 : CallingConv<[ // If FastCC, use RetCC_X86_32_Fast. CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>, + CCIfCC<"CallingConv::Tail", CCDelegateTo<RetCC_X86_32_Fast>>, // If HiPE, use RetCC_X86_32_HiPE. CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>, @@ -1000,6 +1001,7 @@ def CC_X86_32 : CallingConv<[ CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>, CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>, CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>, + CCIfCC<"CallingConv::Tail", CCDelegateTo<CC_X86_32_FastCC>>, CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>, CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>, CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_32_RegCall>>, diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp index a61fa3246f09..5123853f5455 100644 --- a/lib/Target/X86/X86CmovConversion.cpp +++ b/lib/Target/X86/X86CmovConversion.cpp @@ -436,8 +436,8 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates( // Checks for "isUse()" as "uses()" returns also implicit definitions. if (!MO.isReg() || !MO.isUse()) continue; - unsigned Reg = MO.getReg(); - auto &RDM = RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)]; + Register Reg = MO.getReg(); + auto &RDM = RegDefMaps[Register::isVirtualRegister(Reg)]; if (MachineInstr *DefMI = RDM.lookup(Reg)) { OperandToDefMap[&MO] = DefMI; DepthInfo Info = DepthMap.lookup(DefMI); @@ -456,8 +456,8 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates( for (auto &MO : MI.operands()) { if (!MO.isReg() || !MO.isDef()) continue; - unsigned Reg = MO.getReg(); - RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)][Reg] = &MI; + Register Reg = MO.getReg(); + RegDefMaps[Register::isVirtualRegister(Reg)][Reg] = &MI; } unsigned Latency = TSchedModel.computeInstrLatency(&MI); @@ -710,7 +710,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches( // Skip any CMOVs in this group which don't load from memory. if (!MI.mayLoad()) { // Remember the false-side register input. - unsigned FalseReg = + Register FalseReg = MI.getOperand(X86::getCondFromCMov(MI) == CC ? 1 : 2).getReg(); // Walk back through any intermediate cmovs referenced. while (true) { @@ -753,7 +753,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches( // Get a fresh register to use as the destination of the MOV. 
const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg()); - unsigned TmpReg = MRI->createVirtualRegister(RC); + Register TmpReg = MRI->createVirtualRegister(RC); SmallVector<MachineInstr *, 4> NewMIs; bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg, @@ -810,9 +810,9 @@ void X86CmovConverterPass::convertCmovInstsToBranches( DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable; for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { - unsigned DestReg = MIIt->getOperand(0).getReg(); - unsigned Op1Reg = MIIt->getOperand(1).getReg(); - unsigned Op2Reg = MIIt->getOperand(2).getReg(); + Register DestReg = MIIt->getOperand(0).getReg(); + Register Op1Reg = MIIt->getOperand(1).getReg(); + Register Op2Reg = MIIt->getOperand(2).getReg(); // If this CMOV we are processing is the opposite condition from the jump we // generated, then we have to swap the operands for the PHI that is going to diff --git a/lib/Target/X86/X86CondBrFolding.cpp b/lib/Target/X86/X86CondBrFolding.cpp index 9dea94f1368d..1bf2d5ba7b8f 100644 --- a/lib/Target/X86/X86CondBrFolding.cpp +++ b/lib/Target/X86/X86CondBrFolding.cpp @@ -564,7 +564,7 @@ X86CondBrFolding::analyzeMBB(MachineBasicBlock &MBB) { Modified = false; break; } - return llvm::make_unique<TargetMBBInfo>(TargetMBBInfo{ + return std::make_unique<TargetMBBInfo>(TargetMBBInfo{ TBB, FBB, BrInstr, CmpInstr, CC, SrcReg, CmpValue, Modified, CmpBrOnly}); } diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp index 18bbfa32e11b..b4cf5cafbc6e 100644 --- a/lib/Target/X86/X86DomainReassignment.cpp +++ b/lib/Target/X86/X86DomainReassignment.cpp @@ -182,7 +182,7 @@ public: MachineBasicBlock *MBB = MI->getParent(); auto &DL = MI->getDebugLoc(); - unsigned Reg = MRI->createVirtualRegister( + Register Reg = MRI->createVirtualRegister( TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo(), *MBB->getParent())); MachineInstrBuilder Bld = BuildMI(*MBB, MI, DL, TII->get(DstOpcode), Reg); @@ -219,13 +219,13 @@ public: // Don't allow copies to/flow GR8/GR16 physical registers. // FIXME: Is there some better way to support this? - unsigned DstReg = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg) && + Register DstReg = MI->getOperand(0).getReg(); + if (Register::isPhysicalRegister(DstReg) && (X86::GR8RegClass.contains(DstReg) || X86::GR16RegClass.contains(DstReg))) return false; - unsigned SrcReg = MI->getOperand(1).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(SrcReg) && + Register SrcReg = MI->getOperand(1).getReg(); + if (Register::isPhysicalRegister(SrcReg) && (X86::GR8RegClass.contains(SrcReg) || X86::GR16RegClass.contains(SrcReg))) return false; @@ -241,7 +241,7 @@ public: // Physical registers will not be converted. Assume that converting the // COPY to the destination domain will eventually result in a actual // instruction. 
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + if (Register::isPhysicalRegister(MO.getReg())) return 1; RegDomain OpDomain = getDomain(MRI->getRegClass(MO.getReg()), @@ -436,7 +436,7 @@ void X86DomainReassignment::visitRegister(Closure &C, unsigned Reg, if (EnclosedEdges.count(Reg)) return; - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return; if (!MRI->hasOneDef(Reg)) @@ -593,8 +593,8 @@ void X86DomainReassignment::buildClosure(Closure &C, unsigned Reg) { if (!DefOp.isReg()) continue; - unsigned DefReg = DefOp.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(DefReg)) { + Register DefReg = DefOp.getReg(); + if (!Register::isVirtualRegister(DefReg)) { C.setAllIllegal(); continue; } @@ -751,7 +751,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { // Go over all virtual registers and calculate a closure. unsigned ClosureID = 0; for (unsigned Idx = 0; Idx < MRI->getNumVirtRegs(); ++Idx) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(Idx); + unsigned Reg = Register::index2VirtReg(Idx); // GPR only current source domain supported. if (!isGPR(MRI->getRegClass(Reg))) diff --git a/lib/Target/X86/X86EvexToVex.cpp b/lib/Target/X86/X86EvexToVex.cpp index 58680f1815bb..24c8e6d6f6eb 100755 --- a/lib/Target/X86/X86EvexToVex.cpp +++ b/lib/Target/X86/X86EvexToVex.cpp @@ -131,7 +131,7 @@ static bool usesExtendedRegister(const MachineInstr &MI) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); assert(!(Reg >= X86::ZMM0 && Reg <= X86::ZMM31) && "ZMM instructions should not be in the EVEX->VEX tables"); diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp index b8624b40f2f7..9126a1fbea52 100644 --- a/lib/Target/X86/X86ExpandPseudo.cpp +++ b/lib/Target/X86/X86ExpandPseudo.cpp @@ -194,7 +194,8 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case X86::TCRETURNmi64: { bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64; MachineOperand &JumpTarget = MBBI->getOperand(0); - MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1); + MachineOperand &StackAdjust = MBBI->getOperand(isMem ? X86::AddrNumOperands + : 1); assert(StackAdjust.isImm() && "Expecting immediate value."); // Adjust stack pointer. @@ -259,7 +260,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, ? X86::TAILJMPm : (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op)); - for (unsigned i = 0; i != 5; ++i) + for (unsigned i = 0; i != X86::AddrNumOperands; ++i) MIB.add(MBBI->getOperand(i)); } else if (Opcode == X86::TCRETURNri64) { JumpTarget.setIsKill(); @@ -274,7 +275,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MachineInstr &NewMI = *std::prev(MBBI); NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI); - MBB.getParent()->updateCallSiteInfo(&*MBBI, &NewMI); + MBB.getParent()->moveCallSiteInfo(&*MBBI, &NewMI); // Delete the pseudo instruction TCRETURN. MBB.erase(MBBI); @@ -287,7 +288,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, assert(DestAddr.isReg() && "Offset should be in register!"); const bool Uses64BitFramePtr = STI->isTarget64BitLP64() || STI->isTargetNaCl64(); - unsigned StackPtr = TRI->getStackRegister(); + Register StackPtr = TRI->getStackRegister(); BuildMI(MBB, MBBI, DL, TII->get(Uses64BitFramePtr ? 
X86::MOV64rr : X86::MOV32rr), StackPtr) .addReg(DestAddr.getReg()); @@ -347,7 +348,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // actualcmpxchg Addr // [E|R]BX = SaveRbx const MachineOperand &InArg = MBBI->getOperand(6); - unsigned SaveRbx = MBBI->getOperand(7).getReg(); + Register SaveRbx = MBBI->getOperand(7).getReg(); unsigned ActualInArg = Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX; diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 7b9ce0271205..e5e089d07d55 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -1160,6 +1160,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { CallingConv::ID CC = F.getCallingConv(); if (CC != CallingConv::C && CC != CallingConv::Fast && + CC != CallingConv::Tail && CC != CallingConv::X86_FastCall && CC != CallingConv::X86_StdCall && CC != CallingConv::X86_ThisCall && @@ -1173,7 +1174,8 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. - if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) + if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) || + CC == CallingConv::Tail) return false; // Let SDISel handle vararg functions. @@ -1241,7 +1243,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { } // Make the copy. - unsigned DstReg = VA.getLocReg(); + Register DstReg = VA.getLocReg(); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); // Avoid a cross-class copy. This is very unlikely. if (!SrcRC->contains(DstReg)) @@ -3157,7 +3159,7 @@ static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget, if (Subtarget->getTargetTriple().isOSMSVCRT()) return 0; if (CC == CallingConv::Fast || CC == CallingConv::GHC || - CC == CallingConv::HiPE) + CC == CallingConv::HiPE || CC == CallingConv::Tail) return 0; if (CS) @@ -3208,6 +3210,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { default: return false; case CallingConv::C: case CallingConv::Fast: + case CallingConv::Tail: case CallingConv::WebKit_JS: case CallingConv::Swift: case CallingConv::X86_FastCall: @@ -3224,7 +3227,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. - if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) + if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) || + CC == CallingConv::Tail) return false; // Don't know how to handle Win64 varargs yet. 
Nothing special needed for @@ -3387,6 +3391,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { case CCValAssign::SExtUpper: case CCValAssign::ZExtUpper: case CCValAssign::FPExt: + case CCValAssign::Trunc: llvm_unreachable("Unexpected loc info!"); case CCValAssign::Indirect: // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully @@ -3547,7 +3552,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { CCValAssign &VA = RVLocs[i]; EVT CopyVT = VA.getValVT(); unsigned CopyReg = ResultReg + i; - unsigned SrcReg = VA.getLocReg(); + Register SrcReg = VA.getLocReg(); // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp index bf541d933790..9f7c4afde760 100644 --- a/lib/Target/X86/X86FixupBWInsts.cpp +++ b/lib/Target/X86/X86FixupBWInsts.cpp @@ -80,7 +80,7 @@ class FixupBWInstPass : public MachineFunctionPass { /// destination register of the MachineInstr passed in. It returns true if /// that super register is dead just prior to \p OrigMI, and false if not. bool getSuperRegDestIfDead(MachineInstr *OrigMI, - unsigned &SuperDestReg) const; + Register &SuperDestReg) const; /// Change the MachineInstr \p MI into the equivalent extending load to 32 bit /// register if it is safe to do so. Return the replacement instruction if @@ -92,6 +92,12 @@ class FixupBWInstPass : public MachineFunctionPass { /// nullptr. MachineInstr *tryReplaceCopy(MachineInstr *MI) const; + /// Change the MachineInstr \p MI into the equivalent extend to 32 bit + /// register if it is safe to do so. Return the replacement instruction if + /// OK, otherwise return nullptr. + MachineInstr *tryReplaceExtend(unsigned New32BitOpcode, + MachineInstr *MI) const; + // Change the MachineInstr \p MI into an eqivalent 32 bit instruction if // possible. Return the replacement instruction if OK, return nullptr // otherwise. @@ -169,10 +175,10 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) { /// /// If so, return that super register in \p SuperDestReg. bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI, - unsigned &SuperDestReg) const { + Register &SuperDestReg) const { auto *TRI = &TII->getRegisterInfo(); - unsigned OrigDestReg = OrigMI->getOperand(0).getReg(); + Register OrigDestReg = OrigMI->getOperand(0).getReg(); SuperDestReg = getX86SubSuperRegister(OrigDestReg, 32); const auto SubRegIdx = TRI->getSubRegIndex(SuperDestReg, OrigDestReg); @@ -232,12 +238,12 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI, // %ax = KILL %ax, implicit killed %eax // RET 0, %ax unsigned Opc = OrigMI->getOpcode(); (void)Opc; - // These are the opcodes currently handled by the pass, if something - // else will be added we need to ensure that new opcode has the same - // properties. - assert((Opc == X86::MOV8rm || Opc == X86::MOV16rm || Opc == X86::MOV8rr || - Opc == X86::MOV16rr) && - "Unexpected opcode."); + // These are the opcodes currently known to work with the code below, if + // something // else will be added we need to ensure that new opcode has the + // same properties. 
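The tryReplaceExtend hook added in this file is sound because the widened instruction produces the same low 16 bits: when the 32-bit super-register is otherwise dead, MOVZX32rr8 and MOVSX32rr8 can stand in for their 16-bit forms, dropping the 0x66 operand-size prefix and, as the CBW comment below notes, sidestepping partial-register merges. A quick exhaustive check of the underlying equivalence:

#include <cassert>
#include <cstdint>

// Widening a zero- or sign-extension from a 16-bit to a 32-bit destination
// leaves the low 16 bits unchanged, which is what makes the rewrite safe
// when the rest of the super-register is dead.
int main() {
  for (int v = 0; v < 256; ++v) {
    uint8_t Src = (uint8_t)v;
    assert((uint16_t)Src == (uint16_t)(uint32_t)Src);              // MOVZX
    assert((int16_t)(int8_t)Src == (int16_t)(int32_t)(int8_t)Src); // MOVSX
  }
  return 0;
}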
+ if (Opc != X86::MOV8rm && Opc != X86::MOV16rm && Opc != X86::MOV8rr && + Opc != X86::MOV16rr) + return false; bool IsDefined = false; for (auto &MO: OrigMI->implicit_operands()) { @@ -247,7 +253,7 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI, assert((MO.isDef() || MO.isUse()) && "Expected Def or Use only!"); if (MO.isDef() && TRI->isSuperRegisterEq(OrigDestReg, MO.getReg())) - IsDefined = true; + IsDefined = true; // If MO is a use of any part of the destination register but is not equal // to OrigDestReg or one of its subregisters, we cannot use SuperDestReg. @@ -268,7 +274,7 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI, MachineInstr *FixupBWInstPass::tryReplaceLoad(unsigned New32BitOpcode, MachineInstr *MI) const { - unsigned NewDestReg; + Register NewDestReg; // We are going to try to rewrite this load to a larger zero-extending // load. This is safe if all portions of the 32 bit super-register @@ -295,11 +301,11 @@ MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const { auto &OldDest = MI->getOperand(0); auto &OldSrc = MI->getOperand(1); - unsigned NewDestReg; + Register NewDestReg; if (!getSuperRegDestIfDead(MI, NewDestReg)) return nullptr; - unsigned NewSrcReg = getX86SubSuperRegister(OldSrc.getReg(), 32); + Register NewSrcReg = getX86SubSuperRegister(OldSrc.getReg(), 32); // This is only correct if we access the same subregister index: otherwise, // we could try to replace "movb %ah, %al" with "movl %eax, %eax". @@ -326,6 +332,33 @@ MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const { return MIB; } +MachineInstr *FixupBWInstPass::tryReplaceExtend(unsigned New32BitOpcode, + MachineInstr *MI) const { + Register NewDestReg; + if (!getSuperRegDestIfDead(MI, NewDestReg)) + return nullptr; + + // Don't interfere with formation of CBW instructions which should be a + // shorter encoding than even the MOVSX32rr8. It's also immunte to partial + // merge issues on Intel CPUs. + if (MI->getOpcode() == X86::MOVSX16rr8 && + MI->getOperand(0).getReg() == X86::AX && + MI->getOperand(1).getReg() == X86::AL) + return nullptr; + + // Safe to change the instruction. + MachineInstrBuilder MIB = + BuildMI(*MF, MI->getDebugLoc(), TII->get(New32BitOpcode), NewDestReg); + + unsigned NumArgs = MI->getNumOperands(); + for (unsigned i = 1; i < NumArgs; ++i) + MIB.add(MI->getOperand(i)); + + MIB.setMemRefs(MI->memoperands()); + + return MIB; +} + MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI, MachineBasicBlock &MBB) const { // See if this is an instruction of the type we are currently looking for. @@ -355,6 +388,15 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI, // of the register. return tryReplaceCopy(MI); + case X86::MOVSX16rr8: + return tryReplaceExtend(X86::MOVSX32rr8, MI); + case X86::MOVSX16rm8: + return tryReplaceExtend(X86::MOVSX32rm8, MI); + case X86::MOVZX16rr8: + return tryReplaceExtend(X86::MOVZX32rr8, MI); + case X86::MOVZX16rm8: + return tryReplaceExtend(X86::MOVZX32rm8, MI); + default: // nothing to do here. break; diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index 041529a0be68..543dc8b00fa0 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -67,8 +67,8 @@ class FixupLEAPass : public MachineFunctionPass { /// - LEA that uses RIP relative addressing mode /// - LEA that uses 16-bit addressing mode " /// This function currently handles the first 2 cases only. 
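The slow-LEA rewrites reworked below split a three-operand LEA because the address computation decomposes cleanly: form base plus scaled index first, then add the displacement (or use INC/DEC when OptIncDec is set and the displacement is +/-1). A tiny equivalence check with illustrative operands:

#include <cassert>
#include <cstdint>

// lea off(%base,%index,scale), %dst computes base + index*scale + off;
// the split form computes the same value in two steps.
static uint64_t lea3(uint64_t Base, uint64_t Index, uint64_t Scale,
                     int64_t Off) {
  return Base + Index * Scale + Off;
}

static uint64_t leaPlusAdd(uint64_t Base, uint64_t Index, uint64_t Scale,
                           int64_t Off) {
  uint64_t Dst = Base + Index * Scale; // lea (%base,%index,scale), %dst
  Dst += Off;                          // add $off, %dst
  return Dst;
}

int main() {
  assert(lea3(0x1000, 5, 4, 16) == leaPlusAdd(0x1000, 5, 4, 16));
  assert(lea3(0x2000, 3, 8, -1) == leaPlusAdd(0x2000, 3, 8, -1));
}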
- MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI, - MachineBasicBlock &MBB); + void processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I, + MachineBasicBlock &MBB, bool OptIncDec); /// Look for LEAs that are really two address LEAs that we might be able to /// turn into regular ADD instructions. @@ -216,14 +216,10 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &MF) { if (optTwoAddrLEA(I, MBB, OptIncDec, UseLEAForSP)) continue; - if (IsSlowLEA) { + if (IsSlowLEA) processInstructionForSlowLEA(I, MBB); - } else if (IsSlow3OpsLEA) { - if (auto *NewMI = processInstrForSlow3OpLEA(*I, MBB)) { - MBB.erase(I); - I = NewMI; - } - } + else if (IsSlow3OpsLEA) + processInstrForSlow3OpLEA(I, MBB, OptIncDec); } // Second pass for creating LEAs. This may reverse some of the @@ -301,18 +297,14 @@ static inline bool isInefficientLEAReg(unsigned Reg) { Reg == X86::R13D || Reg == X86::R13; } -static inline bool isRegOperand(const MachineOperand &Op) { - return Op.isReg() && Op.getReg() != X86::NoRegister; -} - /// Returns true if this LEA uses base an index registers, and the base register /// is known to be inefficient for the subtarget. // TODO: use a variant scheduling class to model the latency profile // of LEA instructions, and implement this logic as a scheduling predicate. static inline bool hasInefficientLEABaseReg(const MachineOperand &Base, const MachineOperand &Index) { - return Base.isReg() && isInefficientLEAReg(Base.getReg()) && - isRegOperand(Index); + return Base.isReg() && isInefficientLEAReg(Base.getReg()) && Index.isReg() && + Index.getReg() != X86::NoRegister; } static inline bool hasLEAOffset(const MachineOperand &Offset) { @@ -372,9 +364,9 @@ bool FixupLEAPass::optTwoAddrLEA(MachineBasicBlock::iterator &I, !TII->isSafeToClobberEFLAGS(MBB, I)) return false; - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned BaseReg = Base.getReg(); - unsigned IndexReg = Index.getReg(); + Register DestReg = MI.getOperand(0).getReg(); + Register BaseReg = Base.getReg(); + Register IndexReg = Index.getReg(); // Don't change stack adjustment LEAs. 
if (UseLEAForSP && (DestReg == X86::ESP || DestReg == X86::RSP)) @@ -500,9 +492,9 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I, if (Segment.getReg() != 0 || !Offset.isImm() || !TII->isSafeToClobberEFLAGS(MBB, I)) return; - const unsigned DstR = Dst.getReg(); - const unsigned SrcR1 = Base.getReg(); - const unsigned SrcR2 = Index.getReg(); + const Register DstR = Dst.getReg(); + const Register SrcR1 = Base.getReg(); + const Register SrcR2 = Index.getReg(); if ((SrcR1 == 0 || SrcR1 != DstR) && (SrcR2 == 0 || SrcR2 != DstR)) return; if (Scale.getImm() > 1) @@ -534,111 +526,150 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I, } } -MachineInstr * -FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, - MachineBasicBlock &MBB) { +void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I, + MachineBasicBlock &MBB, + bool OptIncDec) { + MachineInstr &MI = *I; const unsigned LEAOpcode = MI.getOpcode(); - const MachineOperand &Dst = MI.getOperand(0); + const MachineOperand &Dest = MI.getOperand(0); const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg); const MachineOperand &Scale = MI.getOperand(1 + X86::AddrScaleAmt); const MachineOperand &Index = MI.getOperand(1 + X86::AddrIndexReg); const MachineOperand &Offset = MI.getOperand(1 + X86::AddrDisp); const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg); - if (!(TII->isThreeOperandsLEA(MI) || - hasInefficientLEABaseReg(Base, Index)) || + if (!(TII->isThreeOperandsLEA(MI) || hasInefficientLEABaseReg(Base, Index)) || !TII->isSafeToClobberEFLAGS(MBB, MI) || Segment.getReg() != X86::NoRegister) - return nullptr; + return; + + Register DestReg = Dest.getReg(); + Register BaseReg = Base.getReg(); + Register IndexReg = Index.getReg(); + + if (MI.getOpcode() == X86::LEA64_32r) { + if (BaseReg != 0) + BaseReg = TRI->getSubReg(BaseReg, X86::sub_32bit); + if (IndexReg != 0) + IndexReg = TRI->getSubReg(IndexReg, X86::sub_32bit); + } - unsigned DstR = Dst.getReg(); - unsigned BaseR = Base.getReg(); - unsigned IndexR = Index.getReg(); - unsigned SSDstR = - (LEAOpcode == X86::LEA64_32r) ? getX86SubSuperRegister(DstR, 64) : DstR; bool IsScale1 = Scale.getImm() == 1; - bool IsInefficientBase = isInefficientLEAReg(BaseR); - bool IsInefficientIndex = isInefficientLEAReg(IndexR); + bool IsInefficientBase = isInefficientLEAReg(BaseReg); + bool IsInefficientIndex = isInefficientLEAReg(IndexReg); // Skip these cases since it takes more than 2 instructions // to replace the LEA instruction. - if (IsInefficientBase && SSDstR == BaseR && !IsScale1) - return nullptr; - if (LEAOpcode == X86::LEA64_32r && IsInefficientBase && - (IsInefficientIndex || !IsScale1)) - return nullptr; - - const DebugLoc DL = MI.getDebugLoc(); - const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode)); - const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset)); + if (IsInefficientBase && DestReg == BaseReg && !IsScale1) + return; LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump();); LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: ";); + MachineInstr *NewMI = nullptr; + // First try to replace LEA with one or two (for the 3-op LEA case) // add instructions: // 1.lea (%base,%index,1), %base => add %index,%base // 2.lea (%base,%index,1), %index => add %base,%index - if (IsScale1 && (DstR == BaseR || DstR == IndexR)) { - const MachineOperand &Src = DstR == BaseR ? 
Index : Base; - MachineInstr *NewMI = - BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Src); - LLVM_DEBUG(NewMI->dump();); - // Create ADD instruction for the Offset in case of 3-Ops LEA. - if (hasLEAOffset(Offset)) { - NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); - LLVM_DEBUG(NewMI->dump();); + if (IsScale1 && (DestReg == BaseReg || DestReg == IndexReg)) { + unsigned NewOpc = getADDrrFromLEA(MI.getOpcode()); + if (DestReg != BaseReg) + std::swap(BaseReg, IndexReg); + + if (MI.getOpcode() == X86::LEA64_32r) { + // TODO: Do we need the super register implicit use? + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(BaseReg) + .addReg(IndexReg) + .addReg(Base.getReg(), RegState::Implicit) + .addReg(Index.getReg(), RegState::Implicit); + } else { + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(BaseReg) + .addReg(IndexReg); } - return NewMI; - } - // If the base is inefficient try switching the index and base operands, - // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction: - // lea offset(%base,%index,scale),%dst => - // lea (%base,%index,scale); add offset,%dst - if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) { - MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode)) - .add(Dst) - .add(IsInefficientBase ? Index : Base) - .add(Scale) - .add(IsInefficientBase ? Base : Index) - .addImm(0) - .add(Segment); + } else if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) { + // If the base is inefficient try switching the index and base operands, + // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction: + // lea offset(%base,%index,scale),%dst => + // lea (%base,%index,scale); add offset,%dst + NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(LEAOpcode)) + .add(Dest) + .add(IsInefficientBase ? Index : Base) + .add(Scale) + .add(IsInefficientBase ? Base : Index) + .addImm(0) + .add(Segment); LLVM_DEBUG(NewMI->dump();); + } + + // If either replacement succeeded above, add the offset if needed, then + // replace the instruction. + if (NewMI) { // Create ADD instruction for the Offset in case of 3-Ops LEA. if (hasLEAOffset(Offset)) { - NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); - LLVM_DEBUG(NewMI->dump();); + if (OptIncDec && Offset.isImm() && + (Offset.getImm() == 1 || Offset.getImm() == -1)) { + unsigned NewOpc = + getINCDECFromLEA(MI.getOpcode(), Offset.getImm() == 1); + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(DestReg); + LLVM_DEBUG(NewMI->dump();); + } else { + unsigned NewOpc = getADDriFromLEA(MI.getOpcode(), Offset); + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(DestReg) + .add(Offset); + LLVM_DEBUG(NewMI->dump();); + } } - return NewMI; + + MBB.erase(I); + I = NewMI; + return; } + // Handle the rest of the cases with inefficient base register: - assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!"); + assert(DestReg != BaseReg && "DestReg == BaseReg should be handled already!"); assert(IsInefficientBase && "efficient base should be handled already!"); + // FIXME: Handle LEA64_32r. 
+ if (LEAOpcode == X86::LEA64_32r) + return; + // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst if (IsScale1 && !hasLEAOffset(Offset)) { - bool BIK = Base.isKill() && BaseR != IndexR; - TII->copyPhysReg(MBB, MI, DL, DstR, BaseR, BIK); + bool BIK = Base.isKill() && BaseReg != IndexReg; + TII->copyPhysReg(MBB, MI, MI.getDebugLoc(), DestReg, BaseReg, BIK); LLVM_DEBUG(MI.getPrevNode()->dump();); - MachineInstr *NewMI = - BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Index); + unsigned NewOpc = getADDrrFromLEA(MI.getOpcode()); + NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(DestReg) + .add(Index); LLVM_DEBUG(NewMI->dump();); - return NewMI; + return; } + // lea offset(%base,%index,scale), %dst => // lea offset( ,%index,scale), %dst; add %base,%dst - MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode)) - .add(Dst) - .addReg(0) - .add(Scale) - .add(Index) - .add(Offset) - .add(Segment); + NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(LEAOpcode)) + .add(Dest) + .addReg(0) + .add(Scale) + .add(Index) + .add(Offset) + .add(Segment); LLVM_DEBUG(NewMI->dump();); - NewMI = BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Base); + unsigned NewOpc = getADDrrFromLEA(MI.getOpcode()); + NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(DestReg) + .add(Base); LLVM_DEBUG(NewMI->dump();); - return NewMI; + + MBB.erase(I); + I = NewMI; } diff --git a/lib/Target/X86/X86FixupSetCC.cpp b/lib/Target/X86/X86FixupSetCC.cpp index e2d4d1ede6f3..cbde280aa280 100644 --- a/lib/Target/X86/X86FixupSetCC.cpp +++ b/lib/Target/X86/X86FixupSetCC.cpp @@ -136,8 +136,8 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) { const TargetRegisterClass *RC = MF.getSubtarget<X86Subtarget>().is64Bit() ? &X86::GR32RegClass : &X86::GR32_ABCDRegClass; - unsigned ZeroReg = MRI->createVirtualRegister(RC); - unsigned InsertReg = MRI->createVirtualRegister(RC); + Register ZeroReg = MRI->createVirtualRegister(RC); + Register InsertReg = MRI->createVirtualRegister(RC); // Initialize a register with 0. 
This must go before the eflags def BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0), diff --git a/lib/Target/X86/X86FlagsCopyLowering.cpp b/lib/Target/X86/X86FlagsCopyLowering.cpp index 5ce3255ea96a..cfba06fb6533 100644 --- a/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -721,8 +721,9 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs( for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) { X86::CondCode Cond = X86::getCondFromSETCC(MI); - if (Cond != X86::COND_INVALID && !MI.mayStore() && MI.getOperand(0).isReg() && - TRI->isVirtualRegister(MI.getOperand(0).getReg())) { + if (Cond != X86::COND_INVALID && !MI.mayStore() && + MI.getOperand(0).isReg() && + Register::isVirtualRegister(MI.getOperand(0).getReg())) { assert(MI.getOperand(0).isDef() && "A non-storing SETcc should always define a register!"); CondRegs[Cond] = MI.getOperand(0).getReg(); @@ -739,7 +740,7 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs( unsigned X86FlagsCopyLoweringPass::promoteCondToReg( MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, X86::CondCode Cond) { - unsigned Reg = MRI->createVirtualRegister(PromoteRC); + Register Reg = MRI->createVirtualRegister(PromoteRC); auto SetI = BuildMI(TestMBB, TestPos, TestLoc, TII->get(X86::SETCCr), Reg).addImm(Cond); (void)SetI; @@ -813,7 +814,7 @@ void X86FlagsCopyLoweringPass::rewriteArithmetic( MachineBasicBlock &MBB = *MI.getParent(); // Insert an instruction that will set the flag back to the desired value. - unsigned TmpReg = MRI->createVirtualRegister(PromoteRC); + Register TmpReg = MRI->createVirtualRegister(PromoteRC); auto AddI = BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri)) .addDef(TmpReg, RegState::Dead) @@ -974,7 +975,7 @@ void X86FlagsCopyLoweringPass::rewriteSetCarryExtended( // Now we need to turn this into a bitmask. We do this by subtracting it from // zero. - unsigned ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass); + Register ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass); BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg); ZeroReg = AdjustReg(ZeroReg); @@ -999,7 +1000,7 @@ void X86FlagsCopyLoweringPass::rewriteSetCarryExtended( default: llvm_unreachable("Invalid SETB_C* opcode!"); } - unsigned ResultReg = MRI->createVirtualRegister(&SetBRC); + Register ResultReg = MRI->createVirtualRegister(&SetBRC); BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg) .addReg(ZeroReg) .addReg(ExtCondReg); diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 074cf21d03f5..fcfb5bc91314 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -288,8 +288,8 @@ namespace { // Check if a COPY instruction is using FP registers. static bool isFPCopy(MachineInstr &MI) { - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); return X86::RFP80RegClass.contains(DstReg) || X86::RFP80RegClass.contains(SrcReg); @@ -313,7 +313,7 @@ FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); } /// For example, this returns 3 for X86::FP3. 
static unsigned getFPReg(const MachineOperand &MO) { assert(MO.isReg() && "Expected an FP register!"); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!"); return Reg - X86::FP0; } diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index e310fe069117..1b469a814adc 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -35,8 +35,8 @@ using namespace llvm; X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, - unsigned StackAlignOverride) - : TargetFrameLowering(StackGrowsDown, StackAlignOverride, + MaybeAlign StackAlignOverride) + : TargetFrameLowering(StackGrowsDown, StackAlignOverride.valueOrOne(), STI.is64Bit() ? -8 : -4), STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) { // Cache a bunch of frame-related predicates for this subtarget. @@ -176,7 +176,7 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, MachineOperand &MO = MBBI->getOperand(i); if (!MO.isReg() || MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) @@ -216,7 +216,7 @@ flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) { for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg != X86::EFLAGS) continue; @@ -995,11 +995,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, bool NeedsWinCFI = NeedsWin64CFI || NeedsWinFPO; bool NeedsDwarfCFI = !IsWin64Prologue && (MMI.hasDebugInfo() || Fn.needsUnwindTableEntry()); - unsigned FramePtr = TRI->getFrameRegister(MF); - const unsigned MachineFramePtr = + Register FramePtr = TRI->getFrameRegister(MF); + const Register MachineFramePtr = STI.isTarget64BitILP32() - ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; - unsigned BasePtr = TRI->getBaseRegister(); + ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr; + Register BasePtr = TRI->getBaseRegister(); bool HasWinCFI = false; // Debug location must be unknown since the first debug location is used @@ -1016,14 +1016,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty(); - - // The default stack probe size is 4096 if the function has no stackprobesize - // attribute. - unsigned StackProbeSize = 4096; - if (Fn.hasFnAttribute("stack-probe-size")) - Fn.getFnAttribute("stack-probe-size") - .getValueAsString() - .getAsInteger(0, StackProbeSize); + unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF); // Re-align the stack on 64-bit if the x86-interrupt calling convention is // used and an error code was pushed, since the x86-64 ABI requires a 16-byte @@ -1081,7 +1074,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, int stackGrowth = -SlotSize; // Find the funclet establisher parameter - unsigned Establisher = X86::NoRegister; + Register Establisher = X86::NoRegister; if (IsClrFunclet) Establisher = Uses64BitFramePtr ? 
X86::RCX : X86::ECX; else if (IsFunclet) @@ -1192,7 +1185,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, (MBBI->getOpcode() == X86::PUSH32r || MBBI->getOpcode() == X86::PUSH64r)) { PushedRegs = true; - unsigned Reg = MBBI->getOperand(0).getReg(); + Register Reg = MBBI->getOperand(0).getReg(); ++MBBI; if (!HasFP && NeedsDwarfCFI) { @@ -1396,9 +1389,13 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, int FI; if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) { if (X86::FR64RegClass.contains(Reg)) { + int Offset; unsigned IgnoredFrameReg; - int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg); - Offset += SEHFrameOffset; + if (IsWin64Prologue && IsFunclet) + Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg); + else + Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg) + + SEHFrameOffset; HasWinCFI = true; assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data"); @@ -1554,9 +1551,13 @@ X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const { unsigned X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { + const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); // This is the size of the pushed CSRs. - unsigned CSSize = - MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize(); + unsigned CSSize = X86FI->getCalleeSavedFrameSize(); + // This is the size of callee saved XMMs. + const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo(); + unsigned XMMSize = WinEHXMMSlotInfo.size() * + TRI->getSpillSize(X86::VR128RegClass); // This is the amount of stack a funclet needs to allocate. unsigned UsedSize; EHPersonality Personality = @@ -1576,7 +1577,7 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlignment()); // Subtract out the size of the callee saved registers. This is how much stack // each funclet will allocate. - return FrameSizeMinusRBP - CSSize; + return FrameSizeMinusRBP + XMMSize - CSSize; } static bool isTailCallOpcode(unsigned Opc) { @@ -1597,9 +1598,9 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, DL = MBBI->getDebugLoc(); // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. const bool Is64BitILP32 = STI.isTarget64BitILP32(); - unsigned FramePtr = TRI->getFrameRegister(MF); + Register FramePtr = TRI->getFrameRegister(MF); unsigned MachineFramePtr = - Is64BitILP32 ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; + Is64BitILP32 ? 
Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr; bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWin64CFI = @@ -1850,6 +1851,20 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, return Offset + FPDelta; } +int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, + int FI, unsigned &FrameReg) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo(); + const auto it = WinEHXMMSlotInfo.find(FI); + + if (it == WinEHXMMSlotInfo.end()) + return getFrameIndexReference(MF, FI, FrameReg); + + FrameReg = TRI->getStackRegister(); + return alignTo(MFI.getMaxCallFrameSize(), getStackAlignment()) + it->second; +} + int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, int FI, unsigned &FrameReg, int Adjustment) const { @@ -1948,6 +1963,8 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); unsigned CalleeSavedFrameSize = 0; + unsigned XMMCalleeSavedFrameSize = 0; + auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo(); int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta(); int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); @@ -1984,7 +2001,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( // Since emitPrologue and emitEpilogue will handle spilling and restoring of // the frame register, we can delete it from CSI list and not have to worry // about avoiding it later. - unsigned FPReg = TRI->getFrameRegister(MF); + Register FPReg = TRI->getFrameRegister(MF); for (unsigned i = 0; i < CSI.size(); ++i) { if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) { CSI.erase(CSI.begin() + i); @@ -2025,12 +2042,20 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( unsigned Size = TRI->getSpillSize(*RC); unsigned Align = TRI->getSpillAlignment(*RC); // ensure alignment - SpillSlotOffset -= std::abs(SpillSlotOffset) % Align; + assert(SpillSlotOffset < 0 && "SpillSlotOffset should always < 0 on X86"); + SpillSlotOffset = -alignTo(-SpillSlotOffset, Align); + // spill into slot SpillSlotOffset -= Size; int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset); CSI[i - 1].setFrameIdx(SlotIndex); MFI.ensureMaxAlignment(Align); + + // Save the start offset and size of XMM in stack frame for funclets. + if (X86::VR128RegClass.contains(Reg)) { + WinEHXMMSlotInfo[SlotIndex] = XMMCalleeSavedFrameSize; + XMMCalleeSavedFrameSize += Size; + } } return true; @@ -2200,7 +2225,7 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF, // Spill the BasePtr if it's used. 
if (TRI->hasBasePointer(MF)){ - unsigned BasePtr = TRI->getBaseRegister(); + Register BasePtr = TRI->getBaseRegister(); if (STI.isTarget64BitILP32()) BasePtr = getX86SubSuperRegister(BasePtr, 64); SavedRegs.set(BasePtr); @@ -2212,7 +2237,7 @@ HasNestArgument(const MachineFunction *MF) { const Function &F = MF->getFunction(); for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; I++) { - if (I->hasNestAttr()) + if (I->hasNestAttr() && !I->use_empty()) return true; } return false; @@ -2244,7 +2269,8 @@ GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Pr bool IsNested = HasNestArgument(&MF); if (CallingConvention == CallingConv::X86_FastCall || - CallingConvention == CallingConv::Fast) { + CallingConvention == CallingConv::Fast || + CallingConvention == CallingConv::Tail) { if (IsNested) report_fatal_error("Segmented stacks does not support fastcall with " "nested function."); @@ -2525,6 +2551,18 @@ static unsigned getHiPELiteral( + " required but not provided"); } +// Return true if there are no non-ehpad successors to MBB and there are no +// non-meta instructions between MBBI and MBB.end(). +static bool blockEndIsUnreachable(const MachineBasicBlock &MBB, + MachineBasicBlock::const_iterator MBBI) { + return std::all_of( + MBB.succ_begin(), MBB.succ_end(), + [](const MachineBasicBlock *Succ) { return Succ->isEHPad(); }) && + std::all_of(MBBI, MBB.end(), [](const MachineInstr &MI) { + return MI.isMetaInstruction(); + }); +} + /// Erlang programs may need a special prologue to handle the stack size they /// might need at runtime. That is because Erlang/OTP does not implement a C /// stack but uses a custom implementation of hybrid stack/heap architecture. @@ -2758,7 +2796,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, unsigned Opcode = I->getOpcode(); bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); DebugLoc DL = I->getDebugLoc(); - uint64_t Amount = !reserveCallFrame ? TII.getFrameSize(*I) : 0; + uint64_t Amount = TII.getFrameSize(*I); uint64_t InternalAmt = (isDestroy || Amount) ? TII.getFrameAdjustment(*I) : 0; I = MBB.erase(I); auto InsertPos = skipDebugInstructionsForward(I, MBB.end()); @@ -2847,7 +2885,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, return I; } - if (isDestroy && InternalAmt) { + if (isDestroy && InternalAmt && !blockEndIsUnreachable(MBB, I)) { // If we are performing frame pointer elimination and if the callee pops // something off the stack pointer, add it back. We do this until we have // more advanced stack pointer tracking ability. 
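The Win64 funclet changes above (getWinEHFuncletFrameSize, assignCalleeSavedSpillSlots, getWin64EHFrameIndexRef) record each callee-saved XMM slot in WinEHXMMSlotInfo and then address it relative to the stack pointer as alignTo(MaxCallFrameSize, StackAlign) plus the slot's offset within the XMM save area. The following is a minimal standalone sketch of that arithmetic only, assuming hypothetical frame indices, a 40-byte outgoing-argument area, and a local alignTo helper; none of these values or names come from a real compilation.

#include <cstdint>
#include <cstdio>
#include <map>

// Stand-in for llvm::alignTo, valid for power-of-two alignments.
static std::uint64_t alignTo(std::uint64_t Value, std::uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  // FrameIndex -> offset inside the XMM callee-save area, as recorded by
  // assignCalleeSavedSpillSlots (one 16-byte slot per spilled XMM register).
  // These entries are made-up examples.
  std::map<int, unsigned> WinEHXMMSlotInfo = {{-3, 0}, {-4, 16}};

  std::uint64_t MaxCallFrameSize = 40; // hypothetical outgoing-arg area
  std::uint64_t StackAlign = 16;       // x86-64 stack alignment

  for (const auto &Slot : WinEHXMMSlotInfo) {
    // getWin64EHFrameIndexRef addresses these slots SP-relative so that
    // funclets, which share the parent frame's save area, can reach them.
    std::uint64_t Offset = alignTo(MaxCallFrameSize, StackAlign) + Slot.second;
    std::printf("FI %d -> [rsp + %llu]\n", Slot.first,
                static_cast<unsigned long long>(Offset));
  }
  return 0;
}

With these example numbers the two slots land at rsp+48 and rsp+64, which is why getWin64EHFrameIndexRef returns the stack register rather than the frame pointer for these frame indices.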
@@ -2912,8 +2950,8 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( "restoring EBP/ESI on non-32-bit target"); MachineFunction &MF = *MBB.getParent(); - unsigned FramePtr = TRI->getFrameRegister(MF); - unsigned BasePtr = TRI->getBaseRegister(); + Register FramePtr = TRI->getFrameRegister(MF); + Register BasePtr = TRI->getBaseRegister(); WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index d32746e3a36e..2103d6471ead 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -25,7 +25,7 @@ class X86RegisterInfo; class X86FrameLowering : public TargetFrameLowering { public: - X86FrameLowering(const X86Subtarget &STI, unsigned StackAlignOverride); + X86FrameLowering(const X86Subtarget &STI, MaybeAlign StackAlignOverride); // Cached subtarget predicates. @@ -99,6 +99,8 @@ public: int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; + int getWin64EHFrameIndexRef(const MachineFunction &MF, + int FI, unsigned &SPReg) const; int getFrameIndexReferenceSP(const MachineFunction &MF, int FI, unsigned &SPReg, int Adjustment) const; int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 95d31e62cafc..5b546d42d98a 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -253,6 +253,11 @@ namespace { return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment); } + bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment); + /// Implement addressing mode selection for inline asm expressions. bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, @@ -362,6 +367,11 @@ namespace { if (User->getNumOperands() != 2) continue; + // If this can match to INC/DEC, don't count it as a use. + if (User->getOpcode() == ISD::ADD && + (isOneConstant(SDValue(N, 0)) || isAllOnesConstant(SDValue(N, 0)))) + continue; + // Immediates that are used for offsets as part of stack // manipulation should be left alone. 
These are typically // used to indicate SP offsets for argument passing and @@ -502,8 +512,10 @@ namespace { bool shrinkAndImmediate(SDNode *N); bool isMaskZeroExtended(SDNode *N) const; bool tryShiftAmountMod(SDNode *N); + bool combineIncDecVector(SDNode *Node); bool tryShrinkShlLogicImm(SDNode *N); bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask); + bool tryMatchBitSelect(SDNode *N); MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, const SDLoc &dl, MVT VT, SDNode *Node); @@ -746,7 +758,7 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { return false; LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode()); if (!LD || - LD->isVolatile() || + !LD->isSimple() || LD->getAddressingMode() != ISD::UNINDEXED || LD->getExtensionType() != ISD::NON_EXTLOAD) return false; @@ -873,10 +885,9 @@ void X86DAGToDAGISel::PreprocessISelDAG() { case ISD::FRINT: Imm = 0x4; break; } SDLoc dl(N); - SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, - N->getValueType(0), - N->getOperand(0), - CurDAG->getConstant(Imm, dl, MVT::i8)); + SDValue Res = CurDAG->getNode( + X86ISD::VRNDSCALE, dl, N->getValueType(0), N->getOperand(0), + CurDAG->getTargetConstant(Imm, dl, MVT::i8)); --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); ++I; @@ -2305,10 +2316,10 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent, return false; // We can allow a full vector load here since narrowing a load is ok unless - // it's volatile. + // it's volatile or atomic. if (ISD::isNON_EXTLoad(N.getNode())) { LoadSDNode *LD = cast<LoadSDNode>(N); - if (!LD->isVolatile() && + if (LD->isSimple() && IsProfitableToFold(N, LD, Root) && IsLegalToFold(N, Parent, Root, OptLevel)) { PatternNodeWithChain = N; @@ -2464,6 +2475,37 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N, Complexity += 2; } + // Heuristic: try harder to form an LEA from ADD if the operands set flags. + // Unlike ADD, LEA does not affect flags, so we will be less likely to require + // duplicating flag-producing instructions later in the pipeline. + if (N.getOpcode() == ISD::ADD) { + auto isMathWithFlags = [](SDValue V) { + switch (V.getOpcode()) { + case X86ISD::ADD: + case X86ISD::SUB: + case X86ISD::ADC: + case X86ISD::SBB: + /* TODO: These opcodes can be added safely, but we may want to justify + their inclusion for different reasons (better for reg-alloc). + case X86ISD::SMUL: + case X86ISD::UMUL: + case X86ISD::OR: + case X86ISD::XOR: + case X86ISD::AND: + */ + // Value 1 is the flag output of the node - verify it's not dead. + return !SDValue(V.getNode(), 1).use_empty(); + default: + return false; + } + }; + // TODO: This could be an 'or' rather than 'and' to make the transform more + // likely to happen. We might want to factor in whether there's a + // load folding opportunity for the math op that disappears with LEA. 
+ if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1))) + Complexity++; + } + if (AM.Disp) Complexity++; @@ -2544,6 +2586,7 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { + assert(Root && P && "Unknown root/parent nodes"); if (!ISD::isNON_EXTLoad(N.getNode()) || !IsProfitableToFold(N, P, Root) || !IsLegalToFold(N, P, Root, OptLevel)) @@ -2553,6 +2596,20 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, N.getOperand(1), Base, Scale, Index, Disp, Segment); } +bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment) { + assert(Root && P && "Unknown root/parent nodes"); + if (N->getOpcode() != X86ISD::VBROADCAST_LOAD || + !IsProfitableToFold(N, P, Root) || + !IsLegalToFold(N, P, Root, OptLevel)) + return false; + + return selectAddr(N.getNode(), + N.getOperand(1), Base, Scale, Index, Disp, Segment); +} + /// Return an SDNode that returns the value of the global base register. /// Output instructions required to initialize the global base register, /// if necessary. @@ -3302,8 +3359,12 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { SDValue ImplDef = SDValue( CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0); insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef); - NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, MVT::i32, ImplDef, - NBits); + + SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32); + insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal); + NBits = SDValue( + CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef, + NBits, SRIdxVal), 0); insertDAGNode(*CurDAG, SDValue(Node, 0), NBits); if (Subtarget->hasBMI2()) { @@ -3400,8 +3461,9 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM // hoisting the move immediate would make it worthwhile with a less optimal // BEXTR? - if (!Subtarget->hasTBM() && - !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR())) + bool PreferBEXTR = + Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR()); + if (!PreferBEXTR && !Subtarget->hasBMI2()) return nullptr; // Must have a shift right. @@ -3440,23 +3502,50 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { if (Shift + MaskSize > NVT.getSizeInBits()) return nullptr; - SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT); - unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; - unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; + // BZHI, if available, is always fast, unlike BEXTR. But even if we decide + // that we can't use BEXTR, it is only worthwhile using BZHI if the mask + // does not fit into 32 bits. Load folding is not a sufficient reason. + if (!PreferBEXTR && MaskSize <= 32) + return nullptr; + + SDValue Control; + unsigned ROpc, MOpc; - // BMI requires the immediate to placed in a register. - if (!Subtarget->hasTBM()) { - ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr; - MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm; + if (!PreferBEXTR) { + assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then."); + // If we can't make use of BEXTR then we can't fuse shift+mask stages. + // Let's perform the mask first, and apply shift later. 
Note that we need to + // widen the mask to account for the fact that we'll apply shift afterwards! + Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT); + ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr; + MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm; unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; - New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0); + Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0); + } else { + // The 'control' of BEXTR has the pattern of: + // [15...8 bit][ 7...0 bit] location + // [ bit count][ shift] name + // I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11 + Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT); + if (Subtarget->hasTBM()) { + ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; + MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; + } else { + assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then."); + // BMI requires the immediate to be placed in a register. + ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr; + MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm; + unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; + Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0); + } } MachineSDNode *NewNode; SDValue Input = N0->getOperand(0); SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { - SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) }; + SDValue Ops[] = { + Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)}; SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); // Update the chain. @@ -3464,7 +3553,15 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { // Record the mem-refs CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()}); } else { - NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, New); + NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control); + } + + if (!PreferBEXTR) { + // We still need to apply the shift. + SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT); + unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri; + NewNode = + CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt); } return NewNode; @@ -3735,6 +3832,52 @@ bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) { return true; } +/// Convert vector increment or decrement to sub/add with an all-ones constant: +/// add X, <1, 1...> --> sub X, <-1, -1...> +/// sub X, <1, 1...> --> add X, <-1, -1...> +/// The all-ones vector constant can be materialized using a pcmpeq instruction +/// that is commonly recognized as an idiom (has no register dependency), so +/// that's better/smaller than loading a splat 1 constant.
+bool X86DAGToDAGISel::combineIncDecVector(SDNode *Node) { + assert((Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::SUB) && + "Unexpected opcode for increment/decrement transform"); + + EVT VT = Node->getValueType(0); + assert(VT.isVector() && "Should only be called for vectors."); + + SDValue X = Node->getOperand(0); + SDValue OneVec = Node->getOperand(1); + + APInt SplatVal; + if (!X86::isConstantSplat(OneVec, SplatVal) || !SplatVal.isOneValue()) + return false; + + SDLoc DL(Node); + SDValue OneConstant, AllOnesVec; + + APInt Ones = APInt::getAllOnesValue(32); + assert(VT.getSizeInBits() % 32 == 0 && + "Expected bit count to be a multiple of 32"); + OneConstant = CurDAG->getConstant(Ones, DL, MVT::i32); + insertDAGNode(*CurDAG, X, OneConstant); + + unsigned NumElts = VT.getSizeInBits() / 32; + assert(NumElts > 0 && "Expected to get non-empty vector."); + AllOnesVec = CurDAG->getSplatBuildVector(MVT::getVectorVT(MVT::i32, NumElts), + DL, OneConstant); + insertDAGNode(*CurDAG, X, AllOnesVec); + + AllOnesVec = CurDAG->getBitcast(VT, AllOnesVec); + insertDAGNode(*CurDAG, X, AllOnesVec); + + unsigned NewOpcode = Node->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; + SDValue NewNode = CurDAG->getNode(NewOpcode, DL, VT, X, AllOnesVec); + + ReplaceNode(Node, NewNode.getNode()); + SelectCode(NewNode.getNode()); + return true; +} + /// If the high bits of an 'and' operand are known zero, try setting the /// high bits of an 'and' constant operand to produce a smaller encoding by /// creating a small, sign-extended negative immediate rather than a large @@ -3975,12 +4118,18 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, if (CC != ISD::SETEQ && CC != ISD::SETNE) return false; - // See if we're comparing against zero. This should have been canonicalized - // to RHS during lowering. - if (!ISD::isBuildVectorAllZeros(Setcc.getOperand(1).getNode())) + SDValue SetccOp0 = Setcc.getOperand(0); + SDValue SetccOp1 = Setcc.getOperand(1); + + // Canonicalize the all zero vector to the RHS. + if (ISD::isBuildVectorAllZeros(SetccOp0.getNode())) + std::swap(SetccOp0, SetccOp1); + + // See if we're comparing against zero. + if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode())) return false; - SDValue N0 = Setcc.getOperand(0); + SDValue N0 = SetccOp0; MVT CmpVT = N0.getSimpleValueType(); MVT CmpSVT = CmpVT.getVectorElementType(); @@ -4027,13 +4176,14 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) { // Look through single use bitcasts. 
- if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) - Src = Src.getOperand(0); - - if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) { + if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) { Parent = Src.getNode(); Src = Src.getOperand(0); - if (Src.getSimpleValueType() == CmpSVT) + } + + if (Src.getOpcode() == X86ISD::VBROADCAST_LOAD && Src.hasOneUse()) { + auto *MemIntr = cast<MemIntrinsicSDNode>(Src); + if (MemIntr->getMemoryVT().getSizeInBits() == CmpSVT.getSizeInBits()) return Src; } @@ -4045,17 +4195,18 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, bool FoldedBCast = false; if (!FoldedLoad && CanFoldLoads && (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) { - SDNode *ParentNode = nullptr; + SDNode *ParentNode = N0.getNode(); if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) { - FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0, - Tmp1, Tmp2, Tmp3, Tmp4); + FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0, + Tmp1, Tmp2, Tmp3, Tmp4); } // Try the other operand. if (!FoldedBCast) { + SDNode *ParentNode = N0.getNode(); if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) { - FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0, - Tmp1, Tmp2, Tmp3, Tmp4); + FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0, + Tmp1, Tmp2, Tmp3, Tmp4); if (FoldedBCast) std::swap(Src0, Src1); } @@ -4125,7 +4276,7 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, // Update the chain. ReplaceUses(Load.getValue(1), SDValue(CNode, 1)); // Record the mem-refs - CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()}); + CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Load)->getMemOperand()}); } else { if (IsMasked) CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1); @@ -4146,6 +4297,55 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, return true; } +// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it +// into vpternlog. +bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) { + assert(N->getOpcode() == ISD::OR && "Unexpected opcode!"); + + MVT NVT = N->getSimpleValueType(0); + + // Make sure we support VPTERNLOG. + if (!NVT.isVector() || !Subtarget->hasAVX512()) + return false; + + // We need VLX for 128/256-bit. + if (!(Subtarget->hasVLX() || NVT.is512BitVector())) + return false; + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // Canonicalize AND to LHS. + if (N1.getOpcode() == ISD::AND) + std::swap(N0, N1); + + if (N0.getOpcode() != ISD::AND || + N1.getOpcode() != X86ISD::ANDNP || + !N0.hasOneUse() || !N1.hasOneUse()) + return false; + + // ANDN is not commutable, use it to pick down A and C. + SDValue A = N1.getOperand(0); + SDValue C = N1.getOperand(1); + + // AND is commutable, if one operand matches A, the other operand is B. + // Otherwise this isn't a match. 
+ SDValue B; + if (N0.getOperand(0) == A) + B = N0.getOperand(1); + else if (N0.getOperand(1) == A) + B = N0.getOperand(0); + else + return false; + + SDLoc dl(N); + SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8); + SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm); + ReplaceNode(N, Ternlog.getNode()); + SelectCode(Ternlog.getNode()); + return true; +} + void X86DAGToDAGISel::Select(SDNode *Node) { MVT NVT = Node->getSimpleValueType(0); unsigned Opcode = Node->getOpcode(); @@ -4170,6 +4370,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { unsigned Opc = 0; switch (IntNo) { + default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::x86_sse3_monitor: if (!Subtarget->hasSSE3()) break; @@ -4303,9 +4504,16 @@ void X86DAGToDAGISel::Select(SDNode *Node) { if (tryShrinkShlLogicImm(Node)) return; + if (Opcode == ISD::OR && tryMatchBitSelect(Node)) + return; + LLVM_FALLTHROUGH; case ISD::ADD: case ISD::SUB: { + if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && NVT.isVector() && + combineIncDecVector(Node)) + return; + // Try to avoid folding immediates with multiple uses for optsize. // This code tries to select to register form directly to avoid going // through the isel table which might fold the immediate. We can't change @@ -4333,6 +4541,10 @@ void X86DAGToDAGISel::Select(SDNode *Node) { if (!isInt<8>(Val) && !isInt<32>(Val)) break; + // If this can match to INC/DEC, let it go. + if (Opcode == ISD::ADD && (Val == 1 || Val == -1)) + break; + // Check if we should avoid folding this immediate. if (!shouldAvoidImmediateInstFormsForSize(N1.getNode())) break; @@ -4610,7 +4822,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: LoReg = X86::AL; ClrReg = HiReg = X86::AH; - SExtOpcode = X86::CBW; + SExtOpcode = 0; // Not used. break; case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; @@ -4632,24 +4844,27 @@ void X86DAGToDAGISel::Select(SDNode *Node) { bool signBitIsZero = CurDAG->SignBitIsZero(N0); SDValue InFlag; - if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) { + if (NVT == MVT::i8) { // Special case for div8, just use a move with zero extension to AX to // clear the upper 8 bits (AH). SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain; MachineSDNode *Move; if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; - Move = CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32, - MVT::Other, Ops); + unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8 + : X86::MOVZX16rm8; + Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops); Chain = SDValue(Move, 1); ReplaceUses(N0.getValue(1), Chain); // Record the mem-refs CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()}); } else { - Move = CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0); + unsigned Opc = (isSigned && !signBitIsZero) ? 
X86::MOVSX16rr8 + : X86::MOVZX16rr8; + Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0); Chain = CurDAG->getEntryNode(); } - Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, SDValue(Move, 0), + Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0), SDValue()); InFlag = Chain.getValue(1); } else { @@ -4996,10 +5211,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) { case ISD::FRINT: Imm = 0x4; break; } SDLoc dl(Node); - SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, - Node->getValueType(0), + SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, Node->getValueType(0), Node->getOperand(0), - CurDAG->getConstant(Imm, dl, MVT::i8)); + CurDAG->getTargetConstant(Imm, dl, MVT::i8)); ReplaceNode(Node, Res.getNode()); SelectCode(Res.getNode()); return; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 0b4bf687e6cf..ed975e9248a8 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -65,17 +65,19 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); -static cl::opt<bool> ExperimentalVectorWideningLegalization( - "x86-experimental-vector-widening-legalization", cl::init(false), - cl::desc("Enable an experimental vector type legalization through widening " - "rather than promotion."), - cl::Hidden); - static cl::opt<int> ExperimentalPrefLoopAlignment( "x86-experimental-pref-loop-alignment", cl::init(4), - cl::desc("Sets the preferable loop alignment for experiments " - "(the last x86-experimental-pref-loop-alignment bits" - " of the loop header PC will be 0)."), + cl::desc( + "Sets the preferable loop alignment for experiments (as log2 bytes)" + "(the last x86-experimental-pref-loop-alignment bits" + " of the loop header PC will be 0)."), + cl::Hidden); + +// Added in 10.0. +static cl::opt<bool> EnableOldKNLABI( + "x86-enable-old-knl-abi", cl::init(false), + cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of " + "one ZMM register on AVX512F, but not AVX512BW targets."), cl::Hidden); static cl::opt<bool> MulConstantOptimization( @@ -84,6 +86,13 @@ static cl::opt<bool> MulConstantOptimization( "SHIFT, LEA, etc."), cl::Hidden); +static cl::opt<bool> ExperimentalUnorderedISEL( + "x86-experimental-unordered-atomic-isel", cl::init(false), + cl::desc("Use LoadSDNode and StoreSDNode instead of " + "AtomicSDNode for unordered atomic loads and " + "stores respectively."), + cl::Hidden); + /// Call this when the user attempts to do something unsupported, like /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike /// report_fatal_error, so calling code should attempt to recover without @@ -196,7 +205,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Integer absolute. if (Subtarget.hasCMov()) { setOperationAction(ISD::ABS , MVT::i16 , Custom); - setOperationAction(ISD::ABS , MVT::i32 , Custom); + setOperationAction(ISD::ABS , MVT::i32 , Custom); } setOperationAction(ISD::ABS , MVT::i64 , Custom); @@ -214,14 +223,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); - if (Subtarget.is64Bit()) { - if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) - // f32/f64 are legal, f80 is custom. 
- setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); - else - setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); - setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); - } else if (!Subtarget.useSoftFloat()) { + if (!Subtarget.useSoftFloat()) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); @@ -277,29 +279,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); - if (Subtarget.is64Bit()) { - if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { - // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80. - setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); - setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); - } else { - setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); - setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); - } - } else if (!Subtarget.useSoftFloat()) { - // Since AVX is a superset of SSE3, only check for SSE here. - if (Subtarget.hasSSE1() && !Subtarget.hasSSE3()) - // Expand FP_TO_UINT into a select. - // FIXME: We would like to use a Custom expander here eventually to do - // the optimal thing for SSE vs. the default expansion in the legalizer. - setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); - else - // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom. - // With SSE3 we can use fisttpll to convert to a signed i64; without - // SSE, we're stuck with a fistpll. - setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); - - setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); + if (!Subtarget.useSoftFloat()) { + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); } // TODO: when we have SSE, these could be more efficient, by using movd/movq. @@ -345,11 +327,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); - setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); + setOperationAction(ISD::FREM , MVT::f128 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); // Promote the i8 variants and force them on up to i32 which has a shorter @@ -396,15 +378,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // There's never any support for operations beyond MVT::f32. 
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); + setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f80, MVT::f16, Expand); + setTruncStoreAction(MVT::f128, MVT::f16, Expand); if (Subtarget.hasPOPCNT()) { setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32); @@ -638,17 +624,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); - // Long double always uses X87, except f128 in MMX. + // f80 always uses X87. if (UseX87) { - if (Subtarget.is64Bit() && Subtarget.hasMMX()) { - addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass - : &X86::VR128RegClass); - ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); - setOperationAction(ISD::FABS , MVT::f128, Custom); - setOperationAction(ISD::FNEG , MVT::f128, Custom); - setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); - } - addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); @@ -684,10 +661,60 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::LLRINT, MVT::f80, Expand); } + // f128 uses xmm registers, but most operations require libcalls. + if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) { + addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass + : &X86::VR128RegClass); + + addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps + + setOperationAction(ISD::FADD, MVT::f128, Custom); + setOperationAction(ISD::FSUB, MVT::f128, Custom); + setOperationAction(ISD::FDIV, MVT::f128, Custom); + setOperationAction(ISD::FMUL, MVT::f128, Custom); + setOperationAction(ISD::FMA, MVT::f128, Expand); + + setOperationAction(ISD::FABS, MVT::f128, Custom); + setOperationAction(ISD::FNEG, MVT::f128, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); + + setOperationAction(ISD::FSIN, MVT::f128, Expand); + setOperationAction(ISD::FCOS, MVT::f128, Expand); + setOperationAction(ISD::FSINCOS, MVT::f128, Expand); + setOperationAction(ISD::FSQRT, MVT::f128, Expand); + + setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); + // We need to custom handle any FP_ROUND with an f128 input, but + // LegalizeDAG uses the result type to know when to run a custom handler. + // So we have to list all legal floating point result types here. 
+ if (isTypeLegal(MVT::f32)) { + setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); + } + if (isTypeLegal(MVT::f64)) { + setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); + } + if (isTypeLegal(MVT::f80)) { + setOperationAction(ISD::FP_ROUND, MVT::f80, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom); + } + + setOperationAction(ISD::SETCC, MVT::f128, Custom); + + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand); + setTruncStoreAction(MVT::f128, MVT::f32, Expand); + setTruncStoreAction(MVT::f128, MVT::f64, Expand); + setTruncStoreAction(MVT::f128, MVT::f80, Expand); + } + // Always use a library call for pow. setOperationAction(ISD::FPOW , MVT::f32 , Expand); setOperationAction(ISD::FPOW , MVT::f64 , Expand); setOperationAction(ISD::FPOW , MVT::f80 , Expand); + setOperationAction(ISD::FPOW , MVT::f128 , Expand); setOperationAction(ISD::FLOG, MVT::f80, Expand); setOperationAction(ISD::FLOG2, MVT::f80, Expand); @@ -716,7 +743,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); @@ -754,7 +781,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ZERO_EXTEND, VT, Expand); setOperationAction(ISD::ANY_EXTEND, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); - for (MVT InnerVT : MVT::vector_valuetypes()) { + for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(InnerVT, VT, Expand); setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); @@ -797,6 +824,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::v2f32, Custom); setOperationAction(ISD::STORE, MVT::v2f32, Custom); + + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { @@ -823,10 +852,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } setOperationAction(ISD::MUL, MVT::v2i8, Custom); - setOperationAction(ISD::MUL, MVT::v2i16, Custom); - setOperationAction(ISD::MUL, MVT::v2i32, Custom); setOperationAction(ISD::MUL, MVT::v4i8, Custom); - setOperationAction(ISD::MUL, MVT::v4i16, Custom); setOperationAction(ISD::MUL, MVT::v8i8, Custom); setOperationAction(ISD::MUL, MVT::v16i8, Custom); @@ -863,28 +889,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom); setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom); - if (!ExperimentalVectorWideningLegalization) { - // Use widening instead of promotion. 
- for (auto VT : { MVT::v8i8, MVT::v4i8, MVT::v2i8, - MVT::v4i16, MVT::v2i16 }) { - setOperationAction(ISD::UADDSAT, VT, Custom); - setOperationAction(ISD::SADDSAT, VT, Custom); - setOperationAction(ISD::USUBSAT, VT, Custom); - setOperationAction(ISD::SSUBSAT, VT, Custom); - } - } - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); - // Provide custom widening for v2f32 setcc. This is really for VLX when - // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to - // type legalization changing the result type to v4i1 during widening. - // It works fine for SSE2 and is probably faster so no need to qualify with - // VLX support. - setOperationAction(ISD::SETCC, MVT::v2i32, Custom); - for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); @@ -904,19 +912,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } - // We support custom legalizing of sext and anyext loads for specific - // memory vector types which we can load as a scalar (or sequence of - // scalars) and extend in-register to a legal 128-bit vector type. For sext - // loads these must work with a single scalar load. - for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); - } - for (auto VT : { MVT::v2f64, MVT::v2i64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); @@ -938,7 +933,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom); // Custom legalize these to avoid over promotion or custom promotion. 
setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom); @@ -991,18 +985,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); - if (ExperimentalVectorWideningLegalization) { - setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); - } else { - setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom); - } + setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); // In the customized shift lowering, the legal v4i32/v2i64 cases // in AVX2 will be recognized. @@ -1069,22 +1059,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); } - if (!ExperimentalVectorWideningLegalization) { - // Avoid narrow result types when widening. The legal types are listed - // in the next loop. - for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); - } - } - // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); - if (!ExperimentalVectorWideningLegalization) - setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal); @@ -1145,6 +1123,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Custom); + if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v32i1, Custom); @@ -1292,10 +1272,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STORE, VT, Custom); } - if (HasInt256) - setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); - if (HasInt256) { + setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); + // Custom legalize 2x32 to get a little better code. 
setOperationAction(ISD::MGATHER, MVT::v2f32, Custom); setOperationAction(ISD::MGATHER, MVT::v2i32, Custom); @@ -1407,6 +1386,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f32, Custom); + setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); @@ -1433,12 +1414,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); - if (ExperimentalVectorWideningLegalization) { - // Need to custom widen this if we don't have AVX512BW. - setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom); - } + // Need to custom widen this if we don't have AVX512BW. + setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom); for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); @@ -1529,10 +1508,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } - // Need to custom split v32i16/v64i8 bitcasts. if (!Subtarget.hasBWI()) { + // Need to custom split v32i16/v64i8 bitcasts. setOperationAction(ISD::BITCAST, MVT::v32i16, Custom); setOperationAction(ISD::BITCAST, MVT::v64i8, Custom); + + // Better to split these into two 256-bit ops. + setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom); + setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom); } if (Subtarget.hasVBMI2()) { @@ -1777,6 +1760,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FSHR, VT, Custom); } } + + setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); } // We want to custom lower some of our intrinsics. @@ -1905,13 +1892,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, MaxLoadsPerMemcmpOptSize = 2; // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). - setPrefLoopAlignment(ExperimentalPrefLoopAlignment); + setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment)); // An out-of-order CPU can speculatively execute past a predictable branch, // but a conditional move could be stalled by an expensive earlier operation. PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder(); EnableExtLdPromotion = true; - setPrefFunctionAlignment(4); // 2^4 bytes. 
+ setPrefFunctionAlignment(Align(16)); verifyIntrinsicTables(); } @@ -1939,8 +1926,7 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const { if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return TypeSplitVector; - if (ExperimentalVectorWideningLegalization && - VT.getVectorNumElements() != 1 && + if (VT.getVectorNumElements() != 1 && VT.getVectorElementType() != MVT::i1) return TypeWidenVector; @@ -1950,19 +1936,62 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const { MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { + // v32i1 vectors should be promoted to v32i8 to match avx2. if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return MVT::v32i8; + // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. + if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && + Subtarget.hasAVX512() && + (!isPowerOf2_32(VT.getVectorNumElements()) || + (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || + (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) + return MVT::i8; + // FIXME: Should we just make these types legal and custom split operations? + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && + Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI) + return MVT::v16i32; return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { + // v32i1 vectors should be promoted to v32i8 to match avx2. if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return 1; + // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. + if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && + Subtarget.hasAVX512() && + (!isPowerOf2_32(VT.getVectorNumElements()) || + (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || + (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) + return VT.getVectorNumElements(); + // FIXME: Should we just make these types legal and custom split operations? + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && + Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI) + return 1; return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } +unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv( + LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, + unsigned &NumIntermediates, MVT &RegisterVT) const { + // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. + if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && + Subtarget.hasAVX512() && + (!isPowerOf2_32(VT.getVectorNumElements()) || + (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || + (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) { + RegisterVT = MVT::i8; + IntermediateVT = MVT::i1; + NumIntermediates = VT.getVectorNumElements(); + return NumIntermediates; + } + + return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT, + NumIntermediates, RegisterVT); +} + EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext& Context, EVT VT) const { @@ -2060,6 +2089,11 @@ EVT X86TargetLowering::getOptimalMemOpType( if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { + // FIXME: Check if unaligned 64-byte accesses are slow. 
+ if (Size >= 64 && Subtarget.hasAVX512() && + (Subtarget.getPreferVectorWidth() >= 512)) { + return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32; + } // FIXME: Check if unaligned 32-byte accesses are slow. if (Size >= 32 && Subtarget.hasAVX() && (Subtarget.getPreferVectorWidth() >= 256)) { @@ -2403,8 +2437,8 @@ static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc, /// Breaks v64i1 value into two registers and adds the new node to the DAG static void Passv64i1ArgInRegs( - const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, - SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA, + const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg, + SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, const X86Subtarget &Subtarget) { assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); assert(Subtarget.is32Bit() && "Expecting 32 bit target"); @@ -2537,7 +2571,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); - Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I], + Passv64i1ArgInRegs(dl, DAG, ValToCopy, RegsToPass, VA, RVLocs[++I], Subtarget); assert(2 == RegsToPass.size() && @@ -2816,6 +2850,10 @@ SDValue X86TargetLowering::LowerCallResult( ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) { errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. + } else if (CopyVT == MVT::f64 && + (Is64Bit && !Subtarget.hasSSE2())) { + errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // If we prefer to use the value in xmm registers, copy it out as f80 and @@ -2925,7 +2963,7 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, static bool canGuaranteeTCO(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE || - CC == CallingConv::HHVM); + CC == CallingConv::HHVM || CC == CallingConv::Tail); } /// Return true if we might ever do TCO for calls with this calling convention. @@ -2951,7 +2989,7 @@ static bool mayTailCallThisCC(CallingConv::ID CC) { /// Return true if the function is being made into a tailcall target by /// changing its ABI. static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { - return GuaranteedTailCallOpt && canGuaranteeTCO(CC); + return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail; } bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { @@ -3405,7 +3443,7 @@ SDValue X86TargetLowering::LowerFormalArguments( // Find the largest legal vector type. MVT VecVT = MVT::Other; // FIXME: Only some x86_32 calling conventions support AVX512. 
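The new 64-byte branch in getOptimalMemOpType extends the existing size ladder for inlined memcpy/memset. A rough model of the selection order; only the 64-byte AVX-512 rung is taken from the hunk, and the smaller fallback types are assumptions, not shown here:

```cpp
#include <string>

// Assumed shape of the type ladder for inlined memory ops; feature flags
// stand in for the subtarget queries.
std::string optimalMemOpType(unsigned Size, bool HasAVX512, bool HasBWI,
                             bool HasAVX, unsigned PreferVectorWidth) {
  if (Size >= 64 && HasAVX512 && PreferVectorWidth >= 512)
    return HasBWI ? "v64i8" : "v16i32"; // BWI unlocks byte-granular 512-bit ops
  if (Size >= 32 && HasAVX && PreferVectorWidth >= 256)
    return "v32i8";  // assumption
  if (Size >= 16)
    return "v16i8";  // assumption
  return "i64";      // assumption: scalar fallback
}
```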
- if (Subtarget.hasAVX512() && + if (Subtarget.useAVX512Regs() && (Is64Bit || (CallConv == CallingConv::X86_VectorCall || CallConv == CallingConv::Intel_OCL_BI))) VecVT = MVT::v16f32; @@ -3577,6 +3615,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU()); bool IsSibcall = false; + bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt || + CallConv == CallingConv::Tail; X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction()); @@ -3597,8 +3637,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (Attr.getValueAsString() == "true") isTailCall = false; - if (Subtarget.isPICStyleGOT() && - !MF.getTarget().Options.GuaranteedTailCallOpt) { + if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) { // If we are using a GOT, disable tail calls to external symbols with // default visibility. Tail calling such a symbol requires using a GOT // relocation, which forces early binding of the symbol. This breaks code @@ -3625,7 +3664,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Sibcalls are automatically detected tailcalls which do not require // ABI changes. - if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) + if (!IsGuaranteeTCO && isTailCall) IsSibcall = true; if (isTailCall) @@ -3657,8 +3696,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; - else if (MF.getTarget().Options.GuaranteedTailCallOpt && - canGuaranteeTCO(CallConv)) + else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; @@ -3782,8 +3820,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); // Split v64i1 value into two registers - Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I], - Subtarget); + Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget); } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); const TargetOptions &Options = DAG.getTarget().Options; @@ -4069,6 +4106,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InFlag = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); + // Save heapallocsite metadata. + if (CLI.CS) + if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite")) + DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc); + // Create the CALLSEQ_END node. 
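Several hunks above thread a single IsGuaranteeTCO flag through LowerCall so that the new tailcc convention (CallingConv::Tail) behaves as if -tailcallopt were always enabled. The two predicates, modeled standalone:

```cpp
enum class CC { C, Fast, GHC, HiPE, HHVM, X86_RegCall, Tail };

// Conventions whose ABI may be adjusted to guarantee tail calls.
bool canGuaranteeTCO(CC C) {
  return C == CC::Fast || C == CC::GHC || C == CC::X86_RegCall ||
         C == CC::HiPE || C == CC::HHVM || C == CC::Tail;
}

// tailcc opts in unconditionally; the others still need -tailcallopt.
bool shouldGuaranteeTCO(CC C, bool GuaranteedTailCallOpt) {
  return (GuaranteedTailCallOpt && canGuaranteeTCO(C)) || C == CC::Tail;
}
```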
unsigned NumBytesForCalleeToPop; if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, @@ -4190,7 +4232,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); - if (!TargetRegisterInfo::isVirtualRegister(VR)) + if (!Register::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) @@ -4279,6 +4321,8 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( bool CCMatch = CallerCC == CalleeCC; bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); + bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt || + CalleeCC == CallingConv::Tail; // Win64 functions have extra shadow space for argument homing. Don't do the // sibcall if the caller and callee have mismatched expectations for this @@ -4286,7 +4330,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( if (IsCalleeWin64 != IsCallerWin64) return false; - if (DAG.getTarget().Options.GuaranteedTailCallOpt) { + if (IsGuaranteeTCO) { if (canGuaranteeTCO(CalleeCC) && CCMatch) return true; return false; @@ -4413,7 +4457,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( CCValAssign &VA = ArgLocs[i]; if (!VA.isRegLoc()) continue; - unsigned Reg = VA.getLocReg(); + Register Reg = VA.getLocReg(); switch (Reg) { default: break; case X86::EAX: case X86::EDX: case X86::ECX: @@ -4652,7 +4696,11 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, // X < 0 -> X == 0, jump on sign. return X86::COND_S; } - if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { + if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) { + // X >= 0 -> X == 0, jump on !sign. + return X86::COND_NS; + } + if (SetCCOpcode == ISD::SETLT && RHSC->getAPIntValue() == 1) { // X < 1 -> X <= 0 RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_LE; @@ -4760,7 +4808,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, ScalarVT = MVT::i32; Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements()); - Info.align = 1; + Info.align = Align::None(); Info.flags |= MachineMemOperand::MOStore; break; } @@ -4773,7 +4821,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, unsigned NumElts = std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); - Info.align = 1; + Info.align = Align::None(); Info.flags |= MachineMemOperand::MOLoad; break; } @@ -4785,7 +4833,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, unsigned NumElts = std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); - Info.align = 1; + Info.align = Align::None(); Info.flags |= MachineMemOperand::MOStore; break; } @@ -4811,6 +4859,8 @@ bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const { + assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow"); + // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF // relocation target a movq or addq instruction: don't let the load shrink. 
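The TranslateX86CC hunk above gains the symmetric case for compares against zero: X < 0 tests the sign flag, and the new X >= 0 case tests its negation. A small table of the folds in that hunk; the string names are for illustration only:

```cpp
#include <optional>
#include <string>

std::optional<std::string> foldSignedCmpWithSmallRHS(const std::string &Cond,
                                                     long RHS) {
  if (Cond == "setlt" && RHS == 0) return "COND_S";  // X < 0: jump on sign
  if (Cond == "setge" && RHS == 0) return "COND_NS"; // X >= 0: jump on !sign
  if (Cond == "setlt" && RHS == 1) return "COND_LE"; // X < 1 becomes X <= 0
  return std::nullopt; // everything else takes the generic path
}
```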
SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr(); @@ -4852,11 +4902,12 @@ bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return true; } -bool X86TargetLowering::reduceSelectOfFPConstantLoads(bool IsFPSetCC) const { +bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const { // If we are using XMM registers in the ABI and the condition of the select is // a floating-point compare and we have blendv or conditional move, then it is // cheaper to select instead of doing a cross-register move and creating a // load that depends on the compare result. + bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128; return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX(); } @@ -4869,15 +4920,25 @@ bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const { return true; } -bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const { +bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, + SDValue C) const { // TODO: We handle scalars using custom code, but generic combining could make // that unnecessary. APInt MulC; if (!ISD::isConstantSplatVector(C.getNode(), MulC)) return false; + // Find the type this will be legalized too. Otherwise we might prematurely + // convert this to shl+add/sub and then still have to type legalize those ops. + // Another choice would be to defer the decision for illegal types until + // after type legalization. But constant splat vectors of i64 can't make it + // through type legalization on 32-bit targets so we would need to special + // case vXi64. + while (getTypeAction(Context, VT) != TypeLegal) + VT = getTypeToTransformTo(Context, VT); + // If vector multiply is legal, assume that's faster than shl + add/sub. - // TODO: Multiply is a complex op with higher latency and lower througput in + // TODO: Multiply is a complex op with higher latency and lower throughput in // most implementations, so this check could be loosened based on type // and/or a CPU attribute. if (isOperationLegal(ISD::MUL, VT)) @@ -5022,6 +5083,33 @@ bool X86TargetLowering::hasAndNot(SDValue Y) const { return Subtarget.hasSSE2(); } +bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const { + return X.getValueType().isScalarInteger(); // 'bt' +} + +bool X86TargetLowering:: + shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, + unsigned OldShiftOpcode, unsigned NewShiftOpcode, + SelectionDAG &DAG) const { + // Does baseline recommend not to perform the fold by default? + if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG)) + return false; + // For scalars this transform is always beneficial. + if (X.getValueType().isScalarInteger()) + return true; + // If all the shift amounts are identical, then transform is beneficial even + // with rudimentary SSE2 shifts. + if (DAG.isSplatValue(Y, /*AllowUndefs=*/true)) + return true; + // If we have AVX2 with it's powerful shift operations, then it's also good. + if (Subtarget.hasAVX2()) + return true; + // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'. 
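The new loop in decomposeMulByConstant walks the type through legalization before asking whether a vector MUL is legal, so the shl+add/sub decision is made for the type that will actually reach instruction selection. A toy model; the type-transform map and legal-type sets below are invented for illustration:

```cpp
#include <set>
#include <string>

// Invented stand-in for getTypeToTransformTo: widen or split one step.
static std::string transformTo(const std::string &VT) {
  if (VT == "v2i32") return "v4i32";   // widen
  if (VT == "v16i16") return "v8i16";  // split
  return VT;
}

bool decomposeMulByConstant(std::string VT) {
  const std::set<std::string> LegalTypes = {"v4i32", "v8i16"};
  const std::set<std::string> LegalMul   = {"v4i32"}; // invented: PMULLD only
  // Walk to the post-legalization type first (bounded for the toy map).
  for (int Step = 0; !LegalTypes.count(VT) && Step < 8; ++Step)
    VT = transformTo(VT);
  // If a real vector multiply is legal at that type, it is assumed faster
  // than shl + add/sub, so do not decompose.
  return !LegalMul.count(VT);
}
```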
+ return NewShiftOpcode == ISD::SHL; +} + bool X86TargetLowering::shouldFoldConstantShiftPairToMask( const SDNode *N, CombineLevel Level) const { assert(((N->getOpcode() == ISD::SHL && @@ -5054,6 +5142,14 @@ bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const { return true; } +bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG, + SDNode *N) const { + if (DAG.getMachineFunction().getFunction().hasMinSize() && + !Subtarget.isOSWindows()) + return false; + return true; +} + bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const { // Any legal vector type can be splatted more efficiently than // loading/spilling from memory. @@ -5093,10 +5189,8 @@ static bool isUndefOrZero(int Val) { /// Return true if every element in Mask, beginning from position Pos and ending /// in Pos+Size is the undef sentinel value. static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { - for (unsigned i = Pos, e = Pos + Size; i != e; ++i) - if (Mask[i] != SM_SentinelUndef) - return false; - return true; + return llvm::all_of(Mask.slice(Pos, Size), + [](int M) { return M == SM_SentinelUndef; }); } /// Return true if the mask creates a vector whose lower half is undefined. @@ -5119,10 +5213,7 @@ static bool isInRange(int Val, int Low, int Hi) { /// Return true if the value of any element in Mask falls within the specified /// range (L, H]. static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) { - for (int M : Mask) - if (isInRange(M, Low, Hi)) - return true; - return false; + return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); }); } /// Return true if Val is undef or if its value falls within the @@ -5133,12 +5224,9 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) { /// Return true if every element in Mask is undef or if its value /// falls within the specified range (L, H]. -static bool isUndefOrInRange(ArrayRef<int> Mask, - int Low, int Hi) { - for (int M : Mask) - if (!isUndefOrInRange(M, Low, Hi)) - return false; - return true; +static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) { + return llvm::all_of( + Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); }); } /// Return true if Val is undef, zero or if its value falls within the @@ -5150,10 +5238,8 @@ static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) { /// Return true if every element in Mask is undef, zero or if its value /// falls within the specified range (L, H]. static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) { - for (int M : Mask) - if (!isUndefOrZeroOrInRange(M, Low, Hi)) - return false; - return true; + return llvm::all_of( + Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); }); } /// Return true if every element in Mask, beginning @@ -5171,8 +5257,9 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos, /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size], or is undef or is zero. static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, - unsigned Size, int Low) { - for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low) + unsigned Size, int Low, + int Step = 1) { + for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step) if (!isUndefOrZero(Mask[i]) && Mask[i] != Low) return false; return true; @@ -5182,10 +5269,8 @@ static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, /// from position Pos and ending in Pos+Size is undef or is zero. 
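A run of mask predicates above is rewritten from index loops to llvm::all_of / llvm::any_of over a mask slice. The same transformation in standard C++, shown for isUndefInRange:

```cpp
#include <algorithm>
#include <vector>

constexpr int SM_SentinelUndef = -1;

// Before: explicit index loop.
bool isUndefInRangeLoop(const std::vector<int> &Mask, unsigned Pos,
                        unsigned Size) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (Mask[i] != SM_SentinelUndef)
      return false;
  return true;
}

// After: algorithm over the slice [Pos, Pos + Size), equivalent to
// llvm::all_of(Mask.slice(Pos, Size), ...).
bool isUndefInRange(const std::vector<int> &Mask, unsigned Pos, unsigned Size) {
  return std::all_of(Mask.begin() + Pos, Mask.begin() + Pos + Size,
                     [](int M) { return M == SM_SentinelUndef; });
}
```

Behavior is unchanged; the algorithm form states the quantifier directly and removes an index variable.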
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { - for (unsigned i = Pos, e = Pos + Size; i != e; ++i) - if (!isUndefOrZero(Mask[i])) - return false; - return true; + return llvm::all_of(Mask.slice(Pos, Size), + [](int M) { return isUndefOrZero(M); }); } /// Helper function to test whether a shuffle mask could be @@ -5357,6 +5442,8 @@ static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget, SDValue Vec; if (!Subtarget.hasSSE2() && VT.is128BitVector()) { Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32); + } else if (VT.isFloatingPoint()) { + Vec = DAG.getConstantFP(+0.0, dl, VT); } else if (VT.getVectorElementType() == MVT::i1) { assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) && "Unexpected vector type"); @@ -5500,6 +5587,7 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) { if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) && Idx == (VT.getVectorNumElements() / 2) && Src.getOpcode() == ISD::INSERT_SUBVECTOR && + Src.getOperand(1).getValueType() == SubVT && isNullConstant(Src.getOperand(2))) { Ops.push_back(Src.getOperand(1)); Ops.push_back(Sub); @@ -5593,7 +5681,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) { // May need to promote to a legal type. Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, - getZeroVector(WideOpVT, Subtarget, DAG, dl), + DAG.getConstant(0, dl, WideOpVT), SubVec, Idx); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } @@ -5609,14 +5697,14 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, if (IdxVal == 0) { // Zero lower bits of the Vec - SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); + SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); // Merge them together, SubVec should be zero extended. 
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, - getZeroVector(WideOpVT, Subtarget, DAG, dl), + DAG.getConstant(0, dl, WideOpVT), SubVec, ZeroIdx); Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); @@ -5628,7 +5716,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, if (Vec.isUndef()) { assert(IdxVal != 0 && "Unexpected index"); SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, - DAG.getConstant(IdxVal, dl, MVT::i8)); + DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } @@ -5638,30 +5726,30 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, unsigned ShiftLeft = NumElems - SubVecNumElems; unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, - DAG.getConstant(ShiftLeft, dl, MVT::i8)); + DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); if (ShiftRight != 0) SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, - DAG.getConstant(ShiftRight, dl, MVT::i8)); + DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } // Simple case when we put subvector in the upper part if (IdxVal + SubVecNumElems == NumElems) { SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, - DAG.getConstant(IdxVal, dl, MVT::i8)); + DAG.getTargetConstant(IdxVal, dl, MVT::i8)); if (SubVecNumElems * 2 == NumElems) { // Special case, use legal zero extending insert_subvector. This allows // isel to opimitize when bits are known zero. Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, - getZeroVector(WideOpVT, Subtarget, DAG, dl), + DAG.getConstant(0, dl, WideOpVT), Vec, ZeroIdx); } else { // Otherwise use explicit shifts to zero the bits. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); NumElems = WideOpVT.getVectorNumElements(); - SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8); + SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8); Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); } @@ -5675,30 +5763,47 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // Widen the vector if needed. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - // Move the current value of the bit to be replace to the lsbs. - Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, - DAG.getConstant(IdxVal, dl, MVT::i8)); - // Xor with the new bit. - Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec); - // Shift to MSB, filling bottom bits with 0. + + // Clear the upper bits of the subvector and move it to its insert position. unsigned ShiftLeft = NumElems - SubVecNumElems; - Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op, - DAG.getConstant(ShiftLeft, dl, MVT::i8)); - // Shift to the final position, filling upper bits with 0. + SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, + DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; - Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op, - DAG.getConstant(ShiftRight, dl, MVT::i8)); - // Xor with original vector leaving the new value. 
- Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op); + SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, + DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); + + // Isolate the bits below the insertion point. + unsigned LowShift = NumElems - IdxVal; + SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, + DAG.getTargetConstant(LowShift, dl, MVT::i8)); + Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low, + DAG.getTargetConstant(LowShift, dl, MVT::i8)); + + // Isolate the bits after the last inserted bit. + unsigned HighShift = IdxVal + SubVecNumElems; + SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, + DAG.getTargetConstant(HighShift, dl, MVT::i8)); + High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High, + DAG.getTargetConstant(HighShift, dl, MVT::i8)); + + // Now OR all 3 pieces together. + Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High); + SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec); + // Reduce to original width if needed. - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } -static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT, - unsigned NumElems, SelectionDAG &DAG, - const SDLoc &dl, unsigned VectorWidth) { - SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth); - return insertSubVector(V, V2, NumElems / 2, DAG, dl, VectorWidth); +static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, + const SDLoc &dl) { + assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch"); + EVT SubVT = V1.getValueType(); + EVT SubSVT = SubVT.getScalarType(); + unsigned SubNumElts = SubVT.getVectorNumElements(); + unsigned SubVectorWidth = SubVT.getSizeInBits(); + EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts); + SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth); + return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth); } /// Returns a vector of specified type with all bits set. @@ -5755,6 +5860,34 @@ static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT, return DAG.getNode(Opcode, DL, VT, In); } +// Match (xor X, -1) -> X. +// Match extract_subvector(xor X, -1) -> extract_subvector(X). +// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y). +static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { + V = peekThroughBitcasts(V); + if (V.getOpcode() == ISD::XOR && + ISD::isBuildVectorAllOnes(V.getOperand(1).getNode())) + return V.getOperand(0); + if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && + (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) { + if (SDValue Not = IsNOT(V.getOperand(0), DAG)) { + Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(), + Not, V.getOperand(1)); + } + } + SmallVector<SDValue, 2> CatOps; + if (collectConcatOps(V.getNode(), CatOps)) { + for (SDValue &CatOp : CatOps) { + SDValue NotCat = IsNOT(CatOp, DAG); + if (!NotCat) return SDValue(); + CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat); + } + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps); + } + return SDValue(); +} + /// Returns a vector_shuffle node for an unpackl operation. 
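The rewritten tail of insert1BitVector drops the old xor/shift sequence in favor of three independently masked pieces, the bits below the insert point, the shifted subvector, and the bits above, OR'd together. On a 64-lane mask register the KSHIFTL/KSHIFTR pairs are plain shifts; a standalone model:

```cpp
#include <cassert>
#include <cstdint>

// Insert the low SubNumElts bits of Sub into Vec at bit position Idx,
// modeling the KSHIFT sequence above on a 64-lane mask register.
uint64_t insertBitVector(uint64_t Vec, uint64_t Sub, unsigned SubNumElts,
                         unsigned Idx) {
  assert(Idx > 0 && SubNumElts > 0 && Idx + SubNumElts < 64);
  // Clear the upper bits of Sub and move it to the insert position.
  Sub = (Sub << (64 - SubNumElts)) >> (64 - SubNumElts - Idx);
  // Isolate the bits below the insertion point.
  uint64_t Low = (Vec << (64 - Idx)) >> (64 - Idx);
  // Isolate the bits at and above the end of the inserted range.
  uint64_t High = (Vec >> (Idx + SubNumElts)) << (Idx + SubNumElts);
  // OR the three pieces together, as in the hunk above.
  return Low | High | Sub;
}
```

The earlier hunks handle IdxVal == 0 and inserts that reach the top lane, which is why the model may assume 0 < Idx and Idx + SubNumElts < 64.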
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { @@ -6003,6 +6136,37 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, } } + if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD && + EltSizeInBits <= VT.getScalarSizeInBits()) { + auto *MemIntr = cast<MemIntrinsicSDNode>(Op); + if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits()) + return false; + + SDValue Ptr = MemIntr->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!CNode || CNode->isMachineConstantPoolEntry() || + CNode->getOffset() != 0) + return false; + + if (const Constant *C = CNode->getConstVal()) { + unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits(); + unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; + + APInt UndefSrcElts(NumSrcElts, 0); + SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0)); + if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) { + if (UndefSrcElts[0]) + UndefSrcElts.setBits(0, NumSrcElts); + SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]); + return CastBitData(UndefSrcElts, SrcEltBits); + } + } + } + // Extract constant bits from a subvector broadcast. if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) { SmallVector<APInt, 16> SubEltBits; @@ -6123,7 +6287,9 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, return false; } -static bool isConstantSplat(SDValue Op, APInt &SplatVal) { +namespace llvm { +namespace X86 { +bool isConstantSplat(SDValue Op, APInt &SplatVal) { APInt UndefElts; SmallVector<APInt, 16> EltBits; if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(), @@ -6146,6 +6312,8 @@ static bool isConstantSplat(SDValue Op, APInt &SplatVal) { return false; } +} // namespace X86 +} // namespace llvm static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, @@ -6551,13 +6719,12 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, return true; } -/// Check a target shuffle mask's inputs to see if we can set any values to -/// SM_SentinelZero - this is for elements that are known to be zero -/// (not just zeroable) from their inputs. +/// Decode a target shuffle mask and inputs and see if any values are +/// known to be undef or zero from their inputs. /// Returns true if the target shuffle mask was decoded. -static bool setTargetShuffleZeroElements(SDValue N, - SmallVectorImpl<int> &Mask, - SmallVectorImpl<SDValue> &Ops) { +static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask, + SmallVectorImpl<SDValue> &Ops, + APInt &KnownUndef, APInt &KnownZero) { bool IsUnary; if (!isTargetShuffle(N.getOpcode())) return false; @@ -6566,15 +6733,17 @@ static bool setTargetShuffleZeroElements(SDValue N, if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary)) return false; + int Size = Mask.size(); SDValue V1 = Ops[0]; SDValue V2 = IsUnary ? V1 : Ops[1]; + KnownUndef = KnownZero = APInt::getNullValue(Size); V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); assert((VT.getSizeInBits() % Mask.size()) == 0 && "Illegal split of shuffle value type"); - unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size(); + unsigned EltSizeInBits = VT.getSizeInBits() / Size; // Extract known constant input data. 
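X86::isConstantSplat, moved into the llvm::X86 namespace above, asks whether every defined lane of a constant vector carries the same value. A model using std::optional to represent undef lanes:

```cpp
#include <optional>
#include <vector>

// Model of the splat test: undef lanes (nullopt) are compatible with
// anything; all defined lanes must agree on a single value.
std::optional<long>
getConstantSplat(const std::vector<std::optional<long>> &Elts) {
  std::optional<long> Splat;
  for (const auto &E : Elts) {
    if (!E)
      continue; // undef lane
    if (Splat && *Splat != *E)
      return std::nullopt; // two different defined values: not a splat
    Splat = E;
  }
  return Splat; // nullopt if every lane was undef
}
```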
APInt UndefSrcElts[2]; @@ -6585,12 +6754,18 @@ static bool setTargetShuffleZeroElements(SDValue N, getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1], SrcEltBits[1], true, false)}; - for (int i = 0, Size = Mask.size(); i < Size; ++i) { + for (int i = 0; i < Size; ++i) { int M = Mask[i]; // Already decoded as SM_SentinelZero / SM_SentinelUndef. - if (M < 0) + if (M < 0) { + assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!"); + if (SM_SentinelUndef == M) + KnownUndef.setBit(i); + if (SM_SentinelZero == M) + KnownZero.setBit(i); continue; + } // Determine shuffle input and normalize the mask. unsigned SrcIdx = M / Size; @@ -6599,7 +6774,7 @@ static bool setTargetShuffleZeroElements(SDValue N, // We are referencing an UNDEF input. if (V.isUndef()) { - Mask[i] = SM_SentinelUndef; + KnownUndef.setBit(i); continue; } @@ -6612,31 +6787,64 @@ static bool setTargetShuffleZeroElements(SDValue N, int Scale = Size / V.getValueType().getVectorNumElements(); int Idx = M / Scale; if (Idx != 0 && !VT.isFloatingPoint()) - Mask[i] = SM_SentinelUndef; + KnownUndef.setBit(i); else if (Idx == 0 && X86::isZeroNode(V.getOperand(0))) - Mask[i] = SM_SentinelZero; + KnownZero.setBit(i); continue; } // Attempt to extract from the source's constant bits. if (IsSrcConstant[SrcIdx]) { if (UndefSrcElts[SrcIdx][M]) - Mask[i] = SM_SentinelUndef; + KnownUndef.setBit(i); else if (SrcEltBits[SrcIdx][M] == 0) - Mask[i] = SM_SentinelZero; + KnownZero.setBit(i); } } - assert(VT.getVectorNumElements() == Mask.size() && + assert(VT.getVectorNumElements() == (unsigned)Size && "Different mask size from vector size!"); return true; } +// Replace target shuffle mask elements with known undef/zero sentinels. +static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask, + const APInt &KnownUndef, + const APInt &KnownZero) { + unsigned NumElts = Mask.size(); + assert(KnownUndef.getBitWidth() == NumElts && + KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch"); + + for (unsigned i = 0; i != NumElts; ++i) { + if (KnownUndef[i]) + Mask[i] = SM_SentinelUndef; + else if (KnownZero[i]) + Mask[i] = SM_SentinelZero; + } +} + +// Extract target shuffle mask sentinel elements to known undef/zero bitmasks. +static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask, + APInt &KnownUndef, + APInt &KnownZero) { + unsigned NumElts = Mask.size(); + KnownUndef = KnownZero = APInt::getNullValue(NumElts); + + for (unsigned i = 0; i != NumElts; ++i) { + int M = Mask[i]; + if (SM_SentinelUndef == M) + KnownUndef.setBit(i); + if (SM_SentinelZero == M) + KnownZero.setBit(i); + } +} + // Forward declaration (for getFauxShuffleMask recursive check). -static bool resolveTargetShuffleInputs(SDValue Op, - SmallVectorImpl<SDValue> &Inputs, - SmallVectorImpl<int> &Mask, - SelectionDAG &DAG); +// TODO: Use DemandedElts variant. +static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs, + SmallVectorImpl<int> &Mask, + SelectionDAG &DAG, unsigned Depth, + bool ResolveKnownElts); // Attempt to decode ops that could be represented as a shuffle mask. 
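The two helpers added above convert between the sentinel representation of a shuffle mask and separate known-undef/known-zero bitmasks. A standalone version for masks of up to 64 lanes, with std::bitset standing in for APInt:

```cpp
#include <bitset>
#include <vector>

constexpr int SM_SentinelUndef = -1;
constexpr int SM_SentinelZero = -2;

// Stamp known-undef/zero lanes into the mask as sentinels.
void resolveFromZeroables(std::vector<int> &Mask, std::bitset<64> KnownUndef,
                          std::bitset<64> KnownZero) {
  for (size_t i = 0; i != Mask.size(); ++i) {
    if (KnownUndef[i])
      Mask[i] = SM_SentinelUndef;
    else if (KnownZero[i])
      Mask[i] = SM_SentinelZero;
  }
}

// And the inverse: extract sentinel lanes back out into bitmasks.
void resolveZeroables(const std::vector<int> &Mask,
                      std::bitset<64> &KnownUndef, std::bitset<64> &KnownZero) {
  KnownUndef.reset();
  KnownZero.reset();
  for (size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] == SM_SentinelUndef) KnownUndef.set(i);
    if (Mask[i] == SM_SentinelZero)  KnownZero.set(i);
  }
}
```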
// The decoded shuffle mask may contain a different number of elements to the @@ -6644,7 +6852,8 @@ static bool resolveTargetShuffleInputs(SDValue Op, static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl<int> &Mask, SmallVectorImpl<SDValue> &Ops, - SelectionDAG &DAG) { + SelectionDAG &DAG, unsigned Depth, + bool ResolveKnownElts) { Mask.clear(); Ops.clear(); @@ -6685,7 +6894,7 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, Mask.push_back(SM_SentinelUndef); continue; } - uint64_t ByteBits = EltBits[i].getZExtValue(); + const APInt &ByteBits = EltBits[i]; if (ByteBits != 0 && ByteBits != 255) return false; Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i); @@ -6696,8 +6905,10 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, case ISD::OR: { // Inspect each operand at the byte level. We can merge these into a // blend shuffle mask if for each byte at least one is masked out (zero). - KnownBits Known0 = DAG.computeKnownBits(N.getOperand(0), DemandedElts); - KnownBits Known1 = DAG.computeKnownBits(N.getOperand(1), DemandedElts); + KnownBits Known0 = + DAG.computeKnownBits(N.getOperand(0), DemandedElts, Depth + 1); + KnownBits Known1 = + DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1); if (Known0.One.isNullValue() && Known1.One.isNullValue()) { bool IsByteMask = true; unsigned NumSizeInBytes = NumSizeInBits / 8; @@ -6736,14 +6947,16 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, return false; SmallVector<int, 64> SrcMask0, SrcMask1; SmallVector<SDValue, 2> SrcInputs0, SrcInputs1; - if (!resolveTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG) || - !resolveTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG)) + if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1, + true) || + !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1, + true)) return false; - int MaskSize = std::max(SrcMask0.size(), SrcMask1.size()); + size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size()); SmallVector<int, 64> Mask0, Mask1; scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0); scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1); - for (int i = 0; i != MaskSize; ++i) { + for (size_t i = 0; i != MaskSize; ++i) { if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef) Mask.push_back(SM_SentinelUndef); else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero) @@ -6751,14 +6964,12 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, else if (Mask1[i] == SM_SentinelZero) Mask.push_back(Mask0[i]); else if (Mask0[i] == SM_SentinelZero) - Mask.push_back(Mask1[i] + (MaskSize * SrcInputs0.size())); + Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size())); else return false; } - for (SDValue &Op : SrcInputs0) - Ops.push_back(Op); - for (SDValue &Op : SrcInputs1) - Ops.push_back(Op); + Ops.append(SrcInputs0.begin(), SrcInputs0.end()); + Ops.append(SrcInputs1.begin(), SrcInputs1.end()); return true; } case ISD::INSERT_SUBVECTOR: { @@ -6786,8 +6997,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)). 
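The ISD::OR case above merges the decoded masks of both operands into a single blend mask: a lane is representable only when at least one side is known zero there. A model of the per-lane merge; inputs are assumed pre-scaled to a common size, as the hunk does with scaleShuffleMask:

```cpp
#include <optional>
#include <vector>

constexpr int SM_SentinelUndef = -1;
constexpr int SM_SentinelZero = -2;

// Merge the masks of OR's two operands (same length Size each).
std::optional<std::vector<int>> mergeOrMasks(const std::vector<int> &M0,
                                             const std::vector<int> &M1,
                                             int NumInputs0, int Size) {
  std::vector<int> Mask;
  for (size_t i = 0; i != M0.size(); ++i) {
    if (M0[i] == SM_SentinelUndef && M1[i] == SM_SentinelUndef)
      Mask.push_back(SM_SentinelUndef);
    else if (M0[i] == SM_SentinelZero && M1[i] == SM_SentinelZero)
      Mask.push_back(SM_SentinelZero);
    else if (M1[i] == SM_SentinelZero)
      Mask.push_back(M0[i]); // RHS contributes nothing in this lane
    else if (M0[i] == SM_SentinelZero)
      Mask.push_back(M1[i] + Size * NumInputs0); // offset into RHS inputs
    else
      return std::nullopt; // both lanes live: OR is not a blend
  }
  return Mask;
}
```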
SmallVector<int, 64> SubMask; SmallVector<SDValue, 2> SubInputs; - if (!resolveTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs, - SubMask, DAG)) + if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs, + SubMask, DAG, Depth + 1, ResolveKnownElts)) return false; if (SubMask.size() != NumSubElts) { assert(((SubMask.size() % NumSubElts) == 0 || @@ -6911,14 +7122,16 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, // as a truncation shuffle. if (Opcode == X86ISD::PACKSS) { if ((!N0.isUndef() && - DAG.ComputeNumSignBits(N0, EltsLHS) <= NumBitsPerElt) || + DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) || (!N1.isUndef() && - DAG.ComputeNumSignBits(N1, EltsRHS) <= NumBitsPerElt)) + DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt)) return false; } else { APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt); - if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS)) || - (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS))) + if ((!N0.isUndef() && + !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) || + (!N1.isUndef() && + !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1))) return false; } @@ -7061,23 +7274,45 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs, Inputs = UsedInputs; } -/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs -/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the -/// remaining input indices in case we now have a unary shuffle and adjust the -/// inputs accordingly. +/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs +/// and then sets the SM_SentinelUndef and SM_SentinelZero values. /// Returns true if the target shuffle mask was decoded. 
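The PACKSS case above only treats the pack as a truncation shuffle when ComputeNumSignBits shows each wide lane is already a sign-extended narrow value; otherwise PACKSS would saturate rather than truncate. The lane semantics for the i16-to-i8 case:

```cpp
#include <cstdint>

// PACKSSWB saturates each i16 lane to i8. It behaves as a plain truncation
// only when the i16 value is already a sign-extended i8, i.e. it has at
// least 9 sign bits, which is what the ComputeNumSignBits check enforces.
int8_t packssLane(int16_t V) {
  if (V > INT8_MAX) return INT8_MAX; // saturate high
  if (V < INT8_MIN) return INT8_MIN; // saturate low
  return static_cast<int8_t>(V);     // in range: identical to truncation
}

bool packssIsTruncate(int16_t V) {
  return V >= INT8_MIN && V <= INT8_MAX; // equivalently: >= 9 sign bits
}
```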
-static bool resolveTargetShuffleInputs(SDValue Op, - SmallVectorImpl<SDValue> &Inputs, - SmallVectorImpl<int> &Mask, - SelectionDAG &DAG) { +static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, + SmallVectorImpl<SDValue> &Inputs, + SmallVectorImpl<int> &Mask, + APInt &KnownUndef, APInt &KnownZero, + SelectionDAG &DAG, unsigned Depth, + bool ResolveKnownElts) { + EVT VT = Op.getValueType(); + if (!VT.isSimple() || !VT.isVector()) + return false; + + if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) { + if (ResolveKnownElts) + resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero); + return true; + } + if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth, + ResolveKnownElts)) { + resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero); + return true; + } + return false; +} + +static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs, + SmallVectorImpl<int> &Mask, + SelectionDAG &DAG, unsigned Depth = 0, + bool ResolveKnownElts = true) { + EVT VT = Op.getValueType(); + if (!VT.isSimple() || !VT.isVector()) + return false; + + APInt KnownUndef, KnownZero; unsigned NumElts = Op.getValueType().getVectorNumElements(); APInt DemandedElts = APInt::getAllOnesValue(NumElts); - if (!setTargetShuffleZeroElements(Op, Mask, Inputs)) - if (!getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG)) - return false; - - resolveTargetShuffleInputsAndMask(Inputs, Mask); - return true; + return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef, + KnownZero, DAG, Depth, ResolveKnownElts); } /// Returns the scalar element that will make up the ith @@ -7414,7 +7649,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); SDLoc DL(Op); SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, - DAG.getIntPtrConstant(InsertPSMask, DL)); + DAG.getIntPtrConstant(InsertPSMask, DL, true)); return DAG.getBitcast(VT, Result); } @@ -7427,7 +7662,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getBitcast(ShVT, SrcOp); assert(NumBits % 8 == 0 && "Only support byte sized shifts"); - SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, MVT::i8); + SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8); return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); } @@ -7439,7 +7674,7 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, // the shuffle mask. if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { SDValue Ptr = LD->getBasePtr(); - if (!ISD::isNormalLoad(LD) || LD->isVolatile()) + if (!ISD::isNormalLoad(LD) || !LD->isSimple()) return SDValue(); EVT PVT = LD->getValueType(0); if (PVT != MVT::i32 && PVT != MVT::f32) @@ -7504,6 +7739,49 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, return SDValue(); } +// Recurse to find a LoadSDNode source and the accumulated ByteOffest. 
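getV4X86ShuffleImm8ForMask now materializes its result with getTargetConstant so it stays an immediate through isel. The underlying encoding, computed by getV4X86ShuffleImm whose body is not part of this hunk, packs four 2-bit lane selectors into an imm8; a sketch of that standard encoding, ignoring undef lanes, which the real helper canonicalizes first:

```cpp
#include <cassert>

unsigned getV4ShuffleImm(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    assert(Mask[i] >= 0 && Mask[i] < 4 && "only 2 bits per lane");
    Imm |= unsigned(Mask[i]) << (i * 2); // low lane occupies the low bits
  }
  return Imm; // e.g. {0,1,2,3} -> 0xE4, the identity shuffle
}
```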
+static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) { + if (ISD::isNON_EXTLoad(Elt.getNode())) { + auto *BaseLd = cast<LoadSDNode>(Elt); + if (!BaseLd->isSimple()) + return false; + Ld = BaseLd; + ByteOffset = 0; + return true; + } + + switch (Elt.getOpcode()) { + case ISD::BITCAST: + case ISD::TRUNCATE: + case ISD::SCALAR_TO_VECTOR: + return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset); + case ISD::SRL: + if (isa<ConstantSDNode>(Elt.getOperand(1))) { + uint64_t Idx = Elt.getConstantOperandVal(1); + if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) { + ByteOffset += Idx / 8; + return true; + } + } + break; + case ISD::EXTRACT_VECTOR_ELT: + if (isa<ConstantSDNode>(Elt.getOperand(1))) { + SDValue Src = Elt.getOperand(0); + unsigned SrcSizeInBits = Src.getScalarValueSizeInBits(); + unsigned DstSizeInBits = Elt.getScalarValueSizeInBits(); + if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 && + findEltLoadSrc(Src, Ld, ByteOffset)) { + uint64_t Idx = Elt.getConstantOperandVal(1); + ByteOffset += Idx * (SrcSizeInBits / 8); + return true; + } + } + break; + } + + return false; +} + /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the /// elements can be replaced by a single large load which has the same value as /// a build_vector or insert_subvector whose loaded operands are 'Elts'. @@ -7513,6 +7791,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool isAfterLegalize) { + if ((VT.getScalarSizeInBits() % 8) != 0) + return SDValue(); + unsigned NumElems = Elts.size(); int LastLoadedElt = -1; @@ -7521,6 +7802,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, APInt UndefMask = APInt::getNullValue(NumElems); SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr); + SmallVector<int64_t, 8> ByteOffsets(NumElems, 0); // For each element in the initializer, see if we've found a load, zero or an // undef. @@ -7539,13 +7821,16 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, // Each loaded element must be the correct fractional portion of the // requested vector load. - if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits()) + unsigned EltSizeInBits = Elt.getValueSizeInBits(); + if ((NumElems * EltSizeInBits) != VT.getSizeInBits()) return SDValue(); - if (!ISD::isNON_EXTLoad(Elt.getNode())) + if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0) + return SDValue(); + unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0); + if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits) return SDValue(); - Loads[i] = cast<LoadSDNode>(Elt); LoadMask.setBit(i); LastLoadedElt = i; } @@ -7575,6 +7860,24 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits; assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected"); + // TODO: Support offsetting the base load. + if (ByteOffsets[FirstLoadedElt] != 0) + return SDValue(); + + // Check to see if the element's load is consecutive to the base load + // or offset from a previous (already checked) load. 
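findEltLoadSrc, added above, chases an element back to its feeding load while accumulating a byte offset from whole-byte right shifts. A toy expression tree capturing the core of that recursion (the real code also folds same-width extract_vector_elt):

```cpp
#include <cstdint>
#include <memory>

// Toy expression node: a load, or a wrapper (bitcast/truncate), or a
// right-shift by a constant amount.
struct Expr {
  enum Kind { Load, Bitcast, Truncate, Srl } K;
  std::shared_ptr<Expr> Op; // inner value; null for Load
  uint64_t ShiftAmt = 0;    // for Srl
};

// Walk to the underlying load, accumulating the byte offset contributed by
// whole-byte shifts; non-byte shifts make the element unusable.
bool findEltLoadSrc(const Expr *E, const Expr *&Ld, int64_t &ByteOffset) {
  switch (E->K) {
  case Expr::Load:
    Ld = E;
    ByteOffset = 0;
    return true;
  case Expr::Bitcast:
  case Expr::Truncate:
    return findEltLoadSrc(E->Op.get(), Ld, ByteOffset);
  case Expr::Srl:
    if (E->ShiftAmt % 8 == 0 && findEltLoadSrc(E->Op.get(), Ld, ByteOffset)) {
      ByteOffset += E->ShiftAmt / 8; // shifting right by 8k skips k bytes
      return true;
    }
    return false;
  }
  return false;
}
```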
+ auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) { + LoadSDNode *Ld = Loads[EltIdx]; + int64_t ByteOffset = ByteOffsets[EltIdx]; + if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) { + int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes); + return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] && + Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0); + } + return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes, + EltIdx - FirstLoadedElt); + }; + // Consecutive loads can contain UNDEFS but not ZERO elements. // Consecutive loads with UNDEFs and ZEROs elements require a // an additional shuffle stage to clear the ZERO elements. @@ -7582,8 +7885,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, bool IsConsecutiveLoadWithZeros = true; for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) { if (LoadMask[i]) { - if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase, BaseSizeInBytes, - i - FirstLoadedElt)) { + if (!CheckConsecutiveLoad(LDBase, i)) { IsConsecutiveLoad = false; IsConsecutiveLoadWithZeros = false; break; @@ -7595,8 +7897,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) { auto MMOFlags = LDBase->getMemOperand()->getFlags(); - assert(!(MMOFlags & MachineMemOperand::MOVolatile) && - "Cannot merge volatile loads."); + assert(LDBase->isSimple() && + "Cannot merge volatile or atomic loads."); SDValue NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags); @@ -7636,17 +7938,22 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded // vector and a zero vector to clear out the zero elements. if (!isAfterLegalize && VT.isVector()) { - SmallVector<int, 4> ClearMask(NumElems, -1); - for (unsigned i = 0; i < NumElems; ++i) { - if (ZeroMask[i]) - ClearMask[i] = i + NumElems; - else if (LoadMask[i]) - ClearMask[i] = i; + unsigned NumMaskElts = VT.getVectorNumElements(); + if ((NumMaskElts % NumElems) == 0) { + unsigned Scale = NumMaskElts / NumElems; + SmallVector<int, 4> ClearMask(NumMaskElts, -1); + for (unsigned i = 0; i < NumElems; ++i) { + if (UndefMask[i]) + continue; + int Offset = ZeroMask[i] ? NumMaskElts : 0; + for (unsigned j = 0; j != Scale; ++j) + ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset; + } + SDValue V = CreateLoad(VT, LDBase); + SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT) + : DAG.getConstantFP(0.0, DL, VT); + return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask); } - SDValue V = CreateLoad(VT, LDBase); - SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT) - : DAG.getConstantFP(0.0, DL, VT); - return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask); } } @@ -8194,34 +8501,10 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, "Unexpected type in LowerBUILD_VECTORvXi1!"); SDLoc dl(Op); - if (ISD::isBuildVectorAllZeros(Op.getNode())) + if (ISD::isBuildVectorAllZeros(Op.getNode()) || + ISD::isBuildVectorAllOnes(Op.getNode())) return Op; - if (ISD::isBuildVectorAllOnes(Op.getNode())) - return Op; - - if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { - if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { - // Split the pieces. 
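The CheckConsecutiveLoad lambda above accepts an element either as a genuinely consecutive load or as a re-read of bytes that an earlier element (same load, offset zero) already validated. A standalone model; the callback stands in for DAG.areNonVolatileConsecutiveLoads:

```cpp
#include <vector>

struct EltLoad { int LoadId; long ByteOff; bool Valid; };

// Element EltIdx is acceptable if its byte offset points back at an element
// that owns offset 0 of the same load, or if it is a distinct load sitting
// the right number of elements after the base load.
bool checkConsecutive(const std::vector<EltLoad> &Elts, int EltIdx,
                      int FirstLoadedElt, int BaseSizeInBytes,
                      bool (*isConsecutive)(int LoadId, int EltDist)) {
  long Off = Elts[EltIdx].ByteOff;
  if (Off != 0 && Off % BaseSizeInBytes == 0) {
    int BaseIdx = EltIdx - int(Off / BaseSizeInBytes);
    return 0 <= BaseIdx && BaseIdx < int(Elts.size()) &&
           Elts[BaseIdx].Valid &&
           Elts[BaseIdx].LoadId == Elts[EltIdx].LoadId &&
           Elts[BaseIdx].ByteOff == 0;
  }
  return isConsecutive(Elts[EltIdx].LoadId, EltIdx - FirstLoadedElt);
}
```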
- SDValue Lower = - DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32)); - SDValue Upper = - DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32)); - // We have to manually lower both halves so getNode doesn't try to - // reassemble the build_vector. - Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget); - Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper); - } - SDValue Imm = ConvertI1VectorToInteger(Op, DAG); - if (Imm.getValueSizeInBits() == VT.getSizeInBits()) - return DAG.getBitcast(VT, Imm); - SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, - DAG.getIntPtrConstant(0, dl)); - } - - // Vector has one or more non-const elements uint64_t Immediate = 0; SmallVector<unsigned, 16> NonConstIdx; bool IsSplat = true; @@ -8244,29 +8527,40 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, } // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" - if (IsSplat) - return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx), + if (IsSplat) { + // The build_vector allows the scalar element to be larger than the vector + // element type. We need to mask it to use as a condition unless we know + // the upper bits are zero. + // FIXME: Use computeKnownBits instead of checking specific opcode? + SDValue Cond = Op.getOperand(SplatIdx); + assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!"); + if (Cond.getOpcode() != ISD::SETCC) + Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond, + DAG.getConstant(1, dl, MVT::i8)); + return DAG.getSelect(dl, VT, Cond, DAG.getConstant(1, dl, VT), DAG.getConstant(0, dl, VT)); + } // insert elements one by one SDValue DstVec; - SDValue Imm; - if (Immediate) { - MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8)); - Imm = DAG.getConstant(Immediate, dl, ImmVT); - } - else if (HasConstElts) - Imm = DAG.getConstant(0, dl, VT); - else - Imm = DAG.getUNDEF(VT); - if (Imm.getValueSizeInBits() == VT.getSizeInBits()) - DstVec = DAG.getBitcast(VT, Imm); - else { - SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); - DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, - DAG.getIntPtrConstant(0, dl)); - } + if (HasConstElts) { + if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { + SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32); + SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32); + ImmL = DAG.getBitcast(MVT::v32i1, ImmL); + ImmH = DAG.getBitcast(MVT::v32i1, ImmH); + DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH); + } else { + MVT ImmVT = MVT::getIntegerVT(std::max(VT.getSizeInBits(), 8U)); + SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT); + MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1; + DstVec = DAG.getBitcast(VecVT, Imm); + DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec, + DAG.getIntPtrConstant(0, dl)); + } + } else + DstVec = DAG.getUNDEF(VT); for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) { unsigned InsertIdx = NonConstIdx[i]; @@ -8757,7 +9051,7 @@ static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, // If we don't need the upper xmm, then perform as a xmm hop. 
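The splat path of LowerBUILD_VECTORvXi1 now masks the i8 condition down to bit 0 before the select, unless it provably came from a compare, since the scalar may carry garbage in its upper bits. A scalar model of the mask it builds:

```cpp
#include <cstdint>

// (select cond, all-ones, all-zeros) for an i1 splat of NumElts lanes,
// modeled on a 64-bit mask register. FromSetCC mirrors the opcode check.
uint64_t buildSplatMask(uint8_t Cond, bool FromSetCC, unsigned NumElts) {
  if (!FromSetCC)
    Cond &= 1; // keep only the boolean bit
  uint64_t AllOnes = NumElts >= 64 ? ~0ULL : ((1ULL << NumElts) - 1);
  return Cond ? AllOnes : 0;
}
```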
unsigned HalfNumElts = NumElts / 2; if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) { - MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), HalfNumElts); + MVT HalfVT = VT.getHalfNumVectorElementsVT(); V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128); V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128); SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1); @@ -8965,21 +9259,14 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, MVT VT = Op.getSimpleValueType(); // Vectors containing all zeros can be matched by pxor and xorps. - if (ISD::isBuildVectorAllZeros(Op.getNode())) { - // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd - // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. - if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) - return Op; - - return getZeroVector(VT, Subtarget, DAG, DL); - } + if (ISD::isBuildVectorAllZeros(Op.getNode())) + return Op; // Vectors containing all ones can be matched by pcmpeqd on 128-bit width // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use // vpcmpeqd on 256-bit vectors. if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { - if (VT == MVT::v4i32 || VT == MVT::v16i32 || - (VT == MVT::v8i32 && Subtarget.hasInt256())) + if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) return Op; return getOnesVector(VT, DAG, DL); @@ -9150,9 +9437,9 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec, {4, 5, 6, 7, 4, 5, 6, 7}); if (Subtarget.hasXOP()) - return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, - LoLo, HiHi, IndicesVec, - DAG.getConstant(0, DL, MVT::i8))); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi, + IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8))); // Permute Lo and Hi and then select based on index range. // This works as VPERMILPS only uses index bits[0:1] to permute elements. SDValue Res = DAG.getSelectCC( @@ -9186,9 +9473,9 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec. IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec); if (Subtarget.hasXOP()) - return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, - LoLo, HiHi, IndicesVec, - DAG.getConstant(0, DL, MVT::i8))); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi, + IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8))); // Permute Lo and Hi and then select based on index range. // This works as VPERMILPD only uses index bit[1] to permute elements. SDValue Res = DAG.getSelectCC( @@ -9283,7 +9570,7 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG, return SDValue(); auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1)); - if (!PermIdx || PermIdx->getZExtValue() != Idx) + if (!PermIdx || PermIdx->getAPIntValue() != Idx) return SDValue(); } @@ -9434,23 +9721,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // it to i32 first. 
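The v8f32 variable-permute path visible above duplicates the low and high 128-bit halves, permutes both with the in-lane index bits, and selects per lane on the index's half bit; XOP's VPERMIL2 does all of this in one instruction. A scalar model of the non-XOP fallback:

```cpp
#include <array>
#include <cstdint>

// VPERMILPS only reads index bits [1:0] within each 128-bit half, so the
// lowering permutes {lo,lo} and {hi,hi} copies and selects on index bit 2.
std::array<float, 8> permuteV8F32(const std::array<float, 8> &Src,
                                  const std::array<uint32_t, 8> &Idx) {
  std::array<float, 8> LoLo, HiHi, Res;
  for (int i = 0; i != 8; ++i) {
    LoLo[i] = Src[i % 4];     // shuffle {0,1,2,3,0,1,2,3}
    HiHi[i] = Src[4 + i % 4]; // shuffle {4,5,6,7,4,5,6,7}
  }
  for (int i = 0; i != 8; ++i) {
    uint32_t Lane = Idx[i] & 3;                 // in-lane selector
    float Lo = LoLo[(i & 4) | Lane];            // permute within i's half
    float Hi = HiHi[(i & 4) | Lane];
    Res[i] = (Idx[i] & 4) ? Hi : Lo;            // select on the half bit
  }
  return Res; // Res[i] == Src[Idx[i]] for indices in [0, 8)
}
```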
if (EltVT == MVT::i16 || EltVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); - if (VT.getSizeInBits() >= 256) { - MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32); - if (Subtarget.hasAVX()) { - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item); - Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); - } else { - // Without AVX, we need to extend to a 128-bit vector and then - // insert into the 256-bit vector. - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); - SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl); - Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl); - } - } else { - assert(VT.is128BitVector() && "Expected an SSE value type!"); - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); - Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); - } + MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); return DAG.getBitcast(VT, Item); } } @@ -9549,8 +9822,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2)); // Recreate the wider vector with the lower and upper part. - return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl, - VT.getSizeInBits() / 2); + return concatSubVectors(Lower, Upper, DAG, dl); } // Let legalizer expand 2-wide build_vectors. @@ -9703,8 +9975,7 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, // If we have more than 2 non-zeros, build each half separately. if (NumNonZero > 2) { - MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), - ResVT.getVectorNumElements()/2); + MVT HalfVT = ResVT.getHalfNumVectorElementsVT(); ArrayRef<SDUse> Ops = Op->ops(); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(0, NumOperands/2)); @@ -9745,30 +10016,47 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && "Unexpected number of operands in CONCAT_VECTORS"); - unsigned NumZero = 0; - unsigned NumNonZero = 0; + uint64_t Zeros = 0; uint64_t NonZeros = 0; for (unsigned i = 0; i != NumOperands; ++i) { SDValue SubVec = Op.getOperand(i); if (SubVec.isUndef()) continue; + assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. if (ISD::isBuildVectorAllZeros(SubVec.getNode())) - ++NumZero; - else { - assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. + Zeros |= (uint64_t)1 << i; + else NonZeros |= (uint64_t)1 << i; - ++NumNonZero; - } } + unsigned NumElems = ResVT.getVectorNumElements(); + + // If we are inserting non-zero vector and there are zeros in LSBs and undef + // in the MSBs we need to emit a KSHIFTL. The generic lowering to + // insert_subvector will give us two kshifts. + if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros && + Log2_64(NonZeros) != NumOperands - 1) { + MVT ShiftVT = ResVT; + if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) + ShiftVT = Subtarget.hasDQI() ? 
MVT::v8i1 : MVT::v16i1; + unsigned Idx = Log2_64(NonZeros); + SDValue SubVec = Op.getOperand(Idx); + unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements(); + SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT, + DAG.getUNDEF(ShiftVT), SubVec, + DAG.getIntPtrConstant(0, dl)); + Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec, + DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op, + DAG.getIntPtrConstant(0, dl)); + } // If there are zero or one non-zeros we can handle this very simply. - if (NumNonZero <= 1) { - SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl) - : DAG.getUNDEF(ResVT); - if (!NumNonZero) + if (NonZeros == 0 || isPowerOf2_64(NonZeros)) { + SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT); + if (!NonZeros) return Vec; - unsigned Idx = countTrailingZeros(NonZeros); + unsigned Idx = Log2_64(NonZeros); SDValue SubVec = Op.getOperand(Idx); unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements(); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec, @@ -9776,8 +10064,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, } if (NumOperands > 2) { - MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), - ResVT.getVectorNumElements()/2); + MVT HalfVT = ResVT.getHalfNumVectorElementsVT(); ArrayRef<SDUse> Ops = Op->ops(); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(0, NumOperands/2)); @@ -9786,7 +10073,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } - assert(NumNonZero == 2 && "Simple cases not handled?"); + assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?"); if (ResVT.getVectorNumElements() >= 16) return Op; // The operation is legal with KUNPCK @@ -9794,7 +10081,6 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT), Op.getOperand(0), DAG.getIntPtrConstant(0, dl)); - unsigned NumElems = ResVT.getVectorNumElements(); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1), DAG.getIntPtrConstant(NumElems/2, dl)); } @@ -9997,42 +10283,44 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask, /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding /// value in ExpectedMask is always accepted. Otherwise the indices must match. /// -/// SM_SentinelZero is accepted as a valid negative index but must match in both. +/// SM_SentinelZero is accepted as a valid negative index but must match in +/// both. static bool isTargetShuffleEquivalent(ArrayRef<int> Mask, - ArrayRef<int> ExpectedMask) { + ArrayRef<int> ExpectedMask, + SDValue V1 = SDValue(), + SDValue V2 = SDValue()) { int Size = Mask.size(); if (Size != (int)ExpectedMask.size()) return false; assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) && "Illegal target shuffle mask"); - for (int i = 0; i < Size; ++i) - if (Mask[i] == SM_SentinelUndef) - continue; - else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero) - return false; - else if (Mask[i] != ExpectedMask[i]) - return false; - - return true; -} + // Check for out-of-range target shuffle mask indices. + if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size)) + return false; -// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle -// mask. 
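LowerCONCAT_VECTORSvXi1 above replaces the NumZero/NumNonZero counters with Zeros/NonZeros bitmasks, so "exactly one non-zero operand" and its position fall straight out of power-of-two tests, and the KSHIFTL path can key off which bits sit below the non-zero one. A sketch of that bookkeeping; the Log2 helper assumes a GCC/Clang builtin:

```cpp
#include <cstdint>

static bool isPowerOf2_64(uint64_t V) { return V && !(V & (V - 1)); }
// Assumes __builtin_clzll (GCC/Clang); precondition: V != 0.
static unsigned Log2_64(uint64_t V) { return 63 - __builtin_clzll(V); }

struct ConcatInfo {
  uint64_t Zeros = 0, NonZeros = 0;
  void addOperand(unsigned i, bool IsZero, bool IsUndef) {
    if (IsUndef) return; // undef operands are recorded in neither mask
    (IsZero ? Zeros : NonZeros) |= uint64_t(1) << i;
  }
  bool singleNonZero() const { return isPowerOf2_64(NonZeros); }
  unsigned nonZeroIdx() const { return Log2_64(NonZeros); } // NonZeros != 0
};
```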
-static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask, - const APInt &Zeroable) { - int NumElts = Mask.size(); - assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes"); + // If the values are build vectors, we can look through them to find + // equivalent inputs that make the shuffles equivalent. + auto *BV1 = dyn_cast_or_null<BuildVectorSDNode>(V1); + auto *BV2 = dyn_cast_or_null<BuildVectorSDNode>(V2); + BV1 = ((BV1 && Size != (int)BV1->getNumOperands()) ? nullptr : BV1); + BV2 = ((BV2 && Size != (int)BV2->getNumOperands()) ? nullptr : BV2); - SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef); - for (int i = 0; i != NumElts; ++i) { - int M = Mask[i]; - if (M == SM_SentinelUndef) + for (int i = 0; i < Size; ++i) { + if (Mask[i] == SM_SentinelUndef || Mask[i] == ExpectedMask[i]) continue; - assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index"); - TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M); + if (0 <= Mask[i] && 0 <= ExpectedMask[i]) { + auto *MaskBV = Mask[i] < Size ? BV1 : BV2; + auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2; + if (MaskBV && ExpectedBV && + MaskBV->getOperand(Mask[i] % Size) == + ExpectedBV->getOperand(ExpectedMask[i] % Size)) + continue; + } + // TODO - handle SM_Sentinel equivalences. + return false; } - return TargetMask; + return true; } // Attempt to create a shuffle mask from a VSELECT condition mask. @@ -10133,7 +10421,7 @@ static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) { static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL, SelectionDAG &DAG) { - return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); + return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); } /// Compute whether each element of a shuffle is zeroable. @@ -10573,14 +10861,14 @@ static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, // Try binary shuffle. SmallVector<int, 32> BinaryMask; createPackShuffleMask(VT, BinaryMask, false); - if (isTargetShuffleEquivalent(TargetMask, BinaryMask)) + if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2)) if (MatchPACK(V1, V2)) return true; // Try unary shuffle. SmallVector<int, 32> UnaryMask; createPackShuffleMask(VT, UnaryMask, true); - if (isTargetShuffleEquivalent(TargetMask, UnaryMask)) + if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1)) if (MatchPACK(V1, V1)) return true; @@ -10685,9 +10973,9 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SelectionDAG &DAG); static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, - MutableArrayRef<int> TargetMask, - bool &ForceV1Zero, bool &ForceV2Zero, - uint64_t &BlendMask) { + MutableArrayRef<int> Mask, + const APInt &Zeroable, bool &ForceV1Zero, + bool &ForceV2Zero, uint64_t &BlendMask) { bool V1IsZeroOrUndef = V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZeroOrUndef = @@ -10695,13 +10983,12 @@ static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, BlendMask = 0; ForceV1Zero = false, ForceV2Zero = false; - assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask"); + assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask"); // Attempt to generate the binary blend mask. If an input is zero then // we can use any lane. - // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. 
- for (int i = 0, Size = TargetMask.size(); i < Size; ++i) { - int M = TargetMask[i]; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + int M = Mask[i]; if (M == SM_SentinelUndef) continue; if (M == i) @@ -10710,16 +10997,16 @@ static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, BlendMask |= 1ull << i; continue; } - if (M == SM_SentinelZero) { + if (Zeroable[i]) { if (V1IsZeroOrUndef) { ForceV1Zero = true; - TargetMask[i] = i; + Mask[i] = i; continue; } if (V2IsZeroOrUndef) { ForceV2Zero = true; BlendMask |= 1ull << i; - TargetMask[i] = i + Size; + Mask[i] = i + Size; continue; } } @@ -10748,11 +11035,10 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable); - uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; - if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero, + SmallVector<int, 64> Mask(Original.begin(), Original.end()); + if (!matchVectorShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero, BlendMask)) return SDValue(); @@ -10778,7 +11064,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, case MVT::v8i16: assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!"); return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8)); + DAG.getTargetConstant(BlendMask, DL, MVT::i8)); case MVT::v16i16: { assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!"); SmallVector<int, 8> RepeatedMask; @@ -10790,7 +11076,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, if (RepeatedMask[i] >= 8) BlendMask |= 1ull << i; return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8)); + DAG.getTargetConstant(BlendMask, DL, MVT::i8)); } // Use PBLENDW for lower/upper lanes and then blend lanes. 
// TODO - we should allow 2 PBLENDW here and leave shuffle combine to @@ -10799,9 +11085,9 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, uint64_t HiMask = (BlendMask >> 8) & 0xFF; if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) { SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, - DAG.getConstant(LoMask, DL, MVT::i8)); + DAG.getTargetConstant(LoMask, DL, MVT::i8)); SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, - DAG.getConstant(HiMask, DL, MVT::i8)); + DAG.getTargetConstant(HiMask, DL, MVT::i8)); return DAG.getVectorShuffle( MVT::v16i16, DL, Lo, Hi, {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31}); @@ -11061,7 +11347,7 @@ static SDValue lowerShuffleAsByteRotateAndPermute( SDValue Rotate = DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi), DAG.getBitcast(ByteVT, Lo), - DAG.getConstant(Scale * RotAmt, DL, MVT::i8))); + DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8))); SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef); for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) { for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) { @@ -11268,7 +11554,7 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, "512-bit PALIGNR requires BWI instructions"); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, - DAG.getConstant(ByteRotation, DL, MVT::i8))); + DAG.getTargetConstant(ByteRotation, DL, MVT::i8))); } assert(VT.is128BitVector() && @@ -11282,10 +11568,12 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, int LoByteShift = 16 - ByteRotation; int HiByteShift = ByteRotation; - SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, - DAG.getConstant(LoByteShift, DL, MVT::i8)); - SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, - DAG.getConstant(HiByteShift, DL, MVT::i8)); + SDValue LoShift = + DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, + DAG.getTargetConstant(LoByteShift, DL, MVT::i8)); + SDValue HiShift = + DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, + DAG.getTargetConstant(HiByteShift, DL, MVT::i8)); return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift)); } @@ -11317,7 +11605,7 @@ static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1, return SDValue(); return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi, - DAG.getConstant(Rotation, DL, MVT::i8)); + DAG.getTargetConstant(Rotation, DL, MVT::i8)); } /// Try to lower a vector shuffle as a byte shift sequence. 
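To make the reworked blend matching above concrete, the following standalone sketch shows how a two-input shuffle mask maps onto a BLENDI immediate: bit i of the immediate selects V2 for element i. This is a simplified model with an assumed name (matchAsBlend), not the patch's matchVectorShuffleAsBlend, and it omits the Zeroable handling that lets a zeroable lane be taken from whichever input is known zero.

    #include <cstdint>
    #include <vector>

    // Simplified model: match a shuffle mask such as {0, 5, 2, 7} (v4i32)
    // to a blend immediate. Element i taken from V2 (index >= Size) sets
    // bit i, so {0, 5, 2, 7} yields 0b1010.
    static bool matchAsBlend(const std::vector<int> &Mask, uint64_t &BlendMask) {
      int Size = static_cast<int>(Mask.size());
      BlendMask = 0;
      for (int i = 0; i < Size; ++i) {
        if (Mask[i] < 0)
          continue;                    // undef: either source works
        if (Mask[i] == i)
          continue;                    // element i comes from V1
        if (Mask[i] == i + Size) {
          BlendMask |= 1ull << i;      // element i comes from V2
          continue;
        }
        return false;                  // element changes lanes: not a blend
      }
      return true;
    }

The patch's version additionally rewrites zeroable lanes into V1/V2 picks (ForceV1Zero/ForceV2Zero), which is exactly what the new Zeroable parameter threads through.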
@@ -11356,27 +11644,27 @@ static SDValue lowerVectorShuffleAsByteShiftMask( if (ZeroLo == 0) { unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * Shift, DL, MVT::i8)); + DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * ZeroHi, DL, MVT::i8)); + DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8)); } else if (ZeroHi == 0) { unsigned Shift = Mask[ZeroLo] % NumElts; Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * Shift, DL, MVT::i8)); + DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * ZeroLo, DL, MVT::i8)); + DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8)); } else if (!Subtarget.hasSSSE3()) { // If we don't have PSHUFB then its worth avoiding an AND constant mask // by performing 3 byte shifts. Shuffle combining can kick in above that. // TODO: There may be some cases where VSH{LR}DQ+PAND is still better. unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * Shift, DL, MVT::i8)); + DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Shift += Mask[ZeroLo] % NumElts; Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * Shift, DL, MVT::i8)); + DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * ZeroLo, DL, MVT::i8)); + DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8)); } else return SDValue(); @@ -11498,7 +11786,7 @@ static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, "Illegal integer vector type"); V = DAG.getBitcast(ShiftVT, V); V = DAG.getNode(Opcode, DL, ShiftVT, V, - DAG.getConstant(ShiftAmt, DL, MVT::i8)); + DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); return DAG.getBitcast(VT, V); } @@ -11632,14 +11920,14 @@ static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, uint64_t BitLen, BitIdx; if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, - DAG.getConstant(BitLen, DL, MVT::i8), - DAG.getConstant(BitIdx, DL, MVT::i8)); + DAG.getTargetConstant(BitLen, DL, MVT::i8), + DAG.getTargetConstant(BitIdx, DL, MVT::i8)); if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT), V2 ? V2 : DAG.getUNDEF(VT), - DAG.getConstant(BitLen, DL, MVT::i8), - DAG.getConstant(BitIdx, DL, MVT::i8)); + DAG.getTargetConstant(BitLen, DL, MVT::i8), + DAG.getTargetConstant(BitIdx, DL, MVT::i8)); return SDValue(); } @@ -11686,9 +11974,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask); }; - // Found a valid zext mask! Try various lowering strategies based on the + // Found a valid a/zext mask! Try various lowering strategies based on the // input type and available ISA extensions. - // TODO: Add AnyExt support. if (Subtarget.hasSSE41()) { // Not worth offsetting 128-bit vectors if scale == 2, a pattern using // PUNPCK will catch this in a later shuffle match. 
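The any-extend support added to lowerShuffleAsSpecificZeroOrAnyExtend above rests on a mask-shape test: every Scale-th lane walks the low elements of the source, and the fill lanes in between are either undef (an any-extend suffices) or must be zero (forcing a zero-extend). Below is a hedged standalone predicate using the usual -1/-2 sentinels; the name isExtendMask and the exact sentinel handling are assumptions for illustration, not the helper the patch calls.

    #include <vector>

    constexpr int Undef = -1; // SM_SentinelUndef
    constexpr int Zero = -2;  // SM_SentinelZero

    // True if Mask is an extend-by-Scale of the low source elements, e.g.
    // Scale == 2 over 8 lanes accepts {0, Zero, 1, Zero, 2, Zero, 3, Undef}.
    // AnyExt stays true only while no fill lane demands an actual zero.
    static bool isExtendMask(const std::vector<int> &Mask, int Scale,
                             bool &AnyExt) {
      AnyExt = true;
      for (int i = 0, e = static_cast<int>(Mask.size()); i != e; ++i) {
        int M = Mask[i];
        if (M == Undef)
          continue;
        if (i % Scale == 0) {
          if (M != i / Scale)
            return false;  // source lanes must be sequential from 0
        } else if (M == Zero) {
          AnyExt = false;  // a required zero rules out a plain any-extend
        } else {
          return false;    // a real element in a fill lane: not an extend
        }
      }
      return true;
    }

This mirrors why the PSHUFB path above may now emit undef lanes instead of 0x80 bytes when an any-extend is enough: the fill bytes are never observed.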
@@ -11697,7 +11984,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = ShuffleOffset(InputV); - InputV = getExtendInVec(ISD::ZERO_EXTEND, DL, ExtVT, InputV, DAG); + InputV = getExtendInVec(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND, DL, + ExtVT, InputV, DAG); return DAG.getBitcast(VT, InputV); } @@ -11736,8 +12024,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( int LoIdx = Offset * EltBits; SDValue Lo = DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, - DAG.getConstant(EltBits, DL, MVT::i8), - DAG.getConstant(LoIdx, DL, MVT::i8))); + DAG.getTargetConstant(EltBits, DL, MVT::i8), + DAG.getTargetConstant(LoIdx, DL, MVT::i8))); if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1)) return DAG.getBitcast(VT, Lo); @@ -11745,8 +12033,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( int HiIdx = (Offset + 1) * EltBits; SDValue Hi = DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, - DAG.getConstant(EltBits, DL, MVT::i8), - DAG.getConstant(HiIdx, DL, MVT::i8))); + DAG.getTargetConstant(EltBits, DL, MVT::i8), + DAG.getTargetConstant(HiIdx, DL, MVT::i8))); return DAG.getBitcast(VT, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); } @@ -11759,8 +12047,12 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( SDValue PSHUFBMask[16]; for (int i = 0; i < 16; ++i) { int Idx = Offset + (i / Scale); - PSHUFBMask[i] = DAG.getConstant( - (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8); + if ((i % Scale == 0 && SafeOffset(Idx))) { + PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8); + continue; + } + PSHUFBMask[i] = + AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8); } InputV = DAG.getBitcast(MVT::v16i8, InputV); return DAG.getBitcast( @@ -12052,9 +12344,9 @@ static SDValue lowerShuffleAsElementInsertion( V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); } else { V2 = DAG.getBitcast(MVT::v16i8, V2); - V2 = DAG.getNode( - X86ISD::VSHLDQ, DL, MVT::v16i8, V2, - DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8)); + V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2, + DAG.getTargetConstant( + V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8)); V2 = DAG.getBitcast(VT, V2); } } @@ -12294,7 +12586,7 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, // If we can't broadcast from a register, check that the input is a load. if (!BroadcastFromReg && !isShuffleFoldableLoad(V)) return SDValue(); - } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) { + } else if (MayFoldLoad(V) && cast<LoadSDNode>(V)->isSimple()) { // 32-bit targets need to load i64 as a f64 and then bitcast the result. if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) { BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); @@ -12486,7 +12778,7 @@ static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, // Insert the V2 element into the desired position. return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, - DAG.getConstant(InsertPSMask, DL, MVT::i8)); + DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); } /// Try to lower a shuffle as a permute of the inputs followed by an @@ -12635,14 +12927,14 @@ static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. 
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, - DAG.getConstant(SHUFPDMask, DL, MVT::i8)); + DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8)); } return DAG.getNode( X86ISD::SHUFP, DL, MVT::v2f64, Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, - DAG.getConstant(SHUFPDMask, DL, MVT::i8)); + DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8)); } assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!"); @@ -12688,7 +12980,7 @@ static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, - DAG.getConstant(SHUFPDMask, DL, MVT::i8)); + DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8)); } /// Handle lowering of 2-lane 64-bit integer shuffles. @@ -12996,10 +13288,12 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 0) { - // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2, - Mask, Subtarget, DAG)) - return Broadcast; + // Try to use broadcast unless the mask only has one non-undef element. + if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) { + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) + return Broadcast; + } // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. @@ -13680,16 +13974,16 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; }); if (NumV2Inputs == 0) { - // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2, - Mask, Subtarget, DAG)) - return Broadcast; - // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable, Subtarget, DAG)) return Shift; + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2, + Mask, Subtarget, DAG)) + return Broadcast; + // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; @@ -13984,8 +14278,16 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); // Unpack the bytes to form the i16s that will be shuffled into place. + bool EvenInUse = false, OddInUse = false; + for (int i = 0; i < 16; i += 2) { + EvenInUse |= (Mask[i + 0] >= 0); + OddInUse |= (Mask[i + 1] >= 0); + if (EvenInUse && OddInUse) + break; + } V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, - MVT::v16i8, V1, V1); + MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8), + OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8)); int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; for (int i = 0; i < 16; ++i) @@ -14100,11 +14402,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // First we need to zero all the dropped bytes. 
assert(NumEvenDrops <= 3 && "No support for dropping even elements more than 3 times."); - // We use the mask type to pick which bytes are preserved based on how many - // elements are dropped. - MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; - SDValue ByteClearMask = DAG.getBitcast( - MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1])); + SmallVector<SDValue, 16> ByteClearOps(16, DAG.getConstant(0, DL, MVT::i8)); + for (unsigned i = 0; i != 16; i += 1 << NumEvenDrops) + ByteClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i8); + SDValue ByteClearMask = DAG.getBuildVector(MVT::v16i8, DL, ByteClearOps); V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); if (!IsSingleInput) V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); @@ -14448,16 +14749,14 @@ static SDValue lowerShuffleAsLanePermuteAndPermute( return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask); } -/// Lower a vector shuffle crossing multiple 128-bit lanes as -/// a permutation and blend of those lanes. +/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one +/// source with a lane permutation. /// -/// This essentially blends the out-of-lane inputs to each lane into the lane -/// from a permuted copy of the vector. This lowering strategy results in four -/// instructions in the worst case for a single-input cross lane shuffle which -/// is lower than any other fully general cross-lane shuffle strategy I'm aware -/// of. Special cases for each particular shuffle pattern should be handled -/// prior to trying this lowering. -static SDValue lowerShuffleAsLanePermuteAndBlend( +/// This lowering strategy results in four instructions in the worst case for a +/// single-input cross lane shuffle which is lower than any other fully general +/// cross-lane shuffle strategy I'm aware of. Special cases for each particular +/// shuffle pattern should be handled prior to trying this lowering. +static SDValue lowerShuffleAsLanePermuteAndShuffle( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FIXME: This should probably be generalized for 512-bit vectors as well. @@ -14484,24 +14783,28 @@ static SDValue lowerShuffleAsLanePermuteAndBlend( return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } + // TODO - we could support shuffling V2 in the Flipped input. assert(V2.isUndef() && "This last part of this routine only works on single input shuffles"); - SmallVector<int, 32> FlippedBlendMask(Size); - for (int i = 0; i < Size; ++i) - FlippedBlendMask[i] = - Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) - ? Mask[i] - : Mask[i] % LaneSize + - (i / LaneSize) * LaneSize + Size); + SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end()); + for (int i = 0; i < Size; ++i) { + int &M = InLaneMask[i]; + if (M < 0) + continue; + if (((M % Size) / LaneSize) != (i / LaneSize)) + M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size; + } + assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) && + "In-lane shuffle mask expected"); - // Flip the vector, and blend the results which should now be in-lane. + // Flip the lanes, and shuffle the results which should now be in-lane. MVT PVT = VT.isFloatingPoint() ? 
MVT::v4f64 : MVT::v4i64; SDValue Flipped = DAG.getBitcast(PVT, V1); - Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), - { 2, 3, 0, 1 }); + Flipped = + DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1}); Flipped = DAG.getBitcast(VT, Flipped); - return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); + return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask); } /// Handle lowering 2-lane 128-bit shuffles. @@ -14565,8 +14868,8 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) { unsigned PermMask = ((WidenedMask[0] % 2) << 0) | ((WidenedMask[1] % 2) << 1); - return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, - DAG.getConstant(PermMask, DL, MVT::i8)); + return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, + DAG.getTargetConstant(PermMask, DL, MVT::i8)); } } } @@ -14598,7 +14901,7 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, V2 = DAG.getUNDEF(VT); return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, - DAG.getConstant(PermMask, DL, MVT::i8)); + DAG.getTargetConstant(PermMask, DL, MVT::i8)); } /// Lower a vector shuffle by first fixing the 128-bit lanes and then @@ -14616,26 +14919,26 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( if (is128BitLaneRepeatedShuffleMask(VT, Mask)) return SDValue(); - int Size = Mask.size(); + int NumElts = Mask.size(); int NumLanes = VT.getSizeInBits() / 128; - int LaneSize = 128 / VT.getScalarSizeInBits(); - SmallVector<int, 16> RepeatMask(LaneSize, -1); + int NumLaneElts = 128 / VT.getScalarSizeInBits(); + SmallVector<int, 16> RepeatMask(NumLaneElts, -1); SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}}); // First pass will try to fill in the RepeatMask from lanes that need two // sources. for (int Lane = 0; Lane != NumLanes; ++Lane) { - int Srcs[2] = { -1, -1 }; - SmallVector<int, 16> InLaneMask(LaneSize, -1); - for (int i = 0; i != LaneSize; ++i) { - int M = Mask[(Lane * LaneSize) + i]; + int Srcs[2] = {-1, -1}; + SmallVector<int, 16> InLaneMask(NumLaneElts, -1); + for (int i = 0; i != NumLaneElts; ++i) { + int M = Mask[(Lane * NumLaneElts) + i]; if (M < 0) continue; // Determine which of the possible input lanes (NumLanes from each source) // this element comes from. Assign that as one of the sources for this // lane. We can assign up to 2 sources for this lane. If we run out // sources we can't do anything. - int LaneSrc = M / LaneSize; + int LaneSrc = M / NumLaneElts; int Src; if (Srcs[0] < 0 || Srcs[0] == LaneSrc) Src = 0; @@ -14645,7 +14948,7 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( return SDValue(); Srcs[Src] = LaneSrc; - InLaneMask[i] = (M % LaneSize) + Src * Size; + InLaneMask[i] = (M % NumLaneElts) + Src * NumElts; } // If this lane has two sources, see if it fits with the repeat mask so far. @@ -14701,23 +15004,23 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( if (LaneSrcs[Lane][0] >= 0) continue; - for (int i = 0; i != LaneSize; ++i) { - int M = Mask[(Lane * LaneSize) + i]; + for (int i = 0; i != NumLaneElts; ++i) { + int M = Mask[(Lane * NumLaneElts) + i]; if (M < 0) continue; // If RepeatMask isn't defined yet we can define it ourself. 
if (RepeatMask[i] < 0) - RepeatMask[i] = M % LaneSize; + RepeatMask[i] = M % NumLaneElts; - if (RepeatMask[i] < Size) { - if (RepeatMask[i] != M % LaneSize) + if (RepeatMask[i] < NumElts) { + if (RepeatMask[i] != M % NumLaneElts) return SDValue(); - LaneSrcs[Lane][0] = M / LaneSize; + LaneSrcs[Lane][0] = M / NumLaneElts; } else { - if (RepeatMask[i] != ((M % LaneSize) + Size)) + if (RepeatMask[i] != ((M % NumLaneElts) + NumElts)) return SDValue(); - LaneSrcs[Lane][1] = M / LaneSize; + LaneSrcs[Lane][1] = M / NumLaneElts; } } @@ -14725,14 +15028,14 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( return SDValue(); } - SmallVector<int, 16> NewMask(Size, -1); + SmallVector<int, 16> NewMask(NumElts, -1); for (int Lane = 0; Lane != NumLanes; ++Lane) { int Src = LaneSrcs[Lane][0]; - for (int i = 0; i != LaneSize; ++i) { + for (int i = 0; i != NumLaneElts; ++i) { int M = -1; if (Src >= 0) - M = Src * LaneSize + i; - NewMask[Lane * LaneSize + i] = M; + M = Src * NumLaneElts + i; + NewMask[Lane * NumLaneElts + i] = M; } } SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); @@ -14745,11 +15048,11 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( for (int Lane = 0; Lane != NumLanes; ++Lane) { int Src = LaneSrcs[Lane][1]; - for (int i = 0; i != LaneSize; ++i) { + for (int i = 0; i != NumLaneElts; ++i) { int M = -1; if (Src >= 0) - M = Src * LaneSize + i; - NewMask[Lane * LaneSize + i] = M; + M = Src * NumLaneElts + i; + NewMask[Lane * NumLaneElts + i] = M; } } SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); @@ -14760,12 +15063,12 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask) return SDValue(); - for (int i = 0; i != Size; ++i) { - NewMask[i] = RepeatMask[i % LaneSize]; + for (int i = 0; i != NumElts; ++i) { + NewMask[i] = RepeatMask[i % NumLaneElts]; if (NewMask[i] < 0) continue; - NewMask[i] += (i / LaneSize) * LaneSize; + NewMask[i] += (i / NumLaneElts) * NumLaneElts; } return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask); } @@ -14831,14 +15134,13 @@ getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask, static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef<int> HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, - SelectionDAG &DAG) { + SelectionDAG &DAG, bool UseConcat = false) { assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?"); assert(V1.getValueType().isSimple() && "Expecting only simple types"); MVT VT = V1.getSimpleValueType(); - unsigned NumElts = VT.getVectorNumElements(); - unsigned HalfNumElts = NumElts / 2; - MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); + MVT HalfVT = VT.getHalfNumVectorElementsVT(); + unsigned HalfNumElts = HalfVT.getVectorNumElements(); auto getHalfVector = [&](int HalfIdx) { if (HalfIdx < 0) @@ -14853,6 +15155,14 @@ static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, SDValue Half1 = getHalfVector(HalfIdx1); SDValue Half2 = getHalfVector(HalfIdx2); SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); + if (UseConcat) { + SDValue Op0 = V; + SDValue Op1 = DAG.getUNDEF(HalfVT); + if (UndefLower) + std::swap(Op0, Op1); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1); + } + unsigned Offset = UndefLower ? 
HalfNumElts : 0; return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, DAG.getIntPtrConstant(Offset, DL)); @@ -14877,9 +15187,8 @@ static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, // Upper half is undef and lower half is whole upper subvector. // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> - unsigned NumElts = VT.getVectorNumElements(); - unsigned HalfNumElts = NumElts / 2; - MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); + MVT HalfVT = VT.getHalfNumVectorElementsVT(); + unsigned HalfNumElts = HalfVT.getVectorNumElements(); if (!UndefLower && isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, @@ -15155,11 +15464,19 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( } static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, - unsigned &ShuffleImm, ArrayRef<int> Mask) { + bool &ForceV1Zero, bool &ForceV2Zero, + unsigned &ShuffleImm, ArrayRef<int> Mask, + const APInt &Zeroable) { int NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && "Unexpected data type for VSHUFPD"); + assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) && + "Illegal shuffle mask"); + + bool ZeroLane[2] = { true, true }; + for (int i = 0; i < NumElts; ++i) + ZeroLane[i & 1] &= Zeroable[i]; // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, .. // Mask for V4F64; 0/1, 4/5, 2/3, 6/7.. @@ -15167,7 +15484,7 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool ShufpdMask = true; bool CommutableMask = true; for (int i = 0; i < NumElts; ++i) { - if (Mask[i] == SM_SentinelUndef) + if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1]) continue; if (Mask[i] < 0) return false; @@ -15180,30 +15497,77 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, ShuffleImm |= (Mask[i] % 2) << i; } - if (ShufpdMask) - return true; - if (CommutableMask) { + if (!ShufpdMask && !CommutableMask) + return false; + + if (!ShufpdMask && CommutableMask) std::swap(V1, V2); - return true; - } - return false; + ForceV1Zero = ZeroLane[0]; + ForceV2Zero = ZeroLane[1]; + return true; } -static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { - assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&& +static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) && "Unexpected data type for VSHUFPD"); unsigned Immediate = 0; - if (!matchShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask)) + bool ForceV1Zero = false, ForceV2Zero = false; + if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate, + Mask, Zeroable)) return SDValue(); + // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. + if (ForceV1Zero) + V1 = getZeroVector(VT, Subtarget, DAG, DL); + if (ForceV2Zero) + V2 = getZeroVector(VT, Subtarget, DAG, DL); + return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, - DAG.getConstant(Immediate, DL, MVT::i8)); + DAG.getTargetConstant(Immediate, DL, MVT::i8)); } +// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed +// by zeroable elements in the remaining 24 elements. Turn this into two +// vmovqb instructions shuffled together. 
+static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + const APInt &Zeroable, + SelectionDAG &DAG) { + assert(VT == MVT::v32i8 && "Unexpected type!"); + + // The first 8 indices should be every 8th element. + if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8)) + return SDValue(); + + // Remaining elements need to be zeroable. + if (Zeroable.countLeadingOnes() < (Mask.size() - 8)) + return SDValue(); + + V1 = DAG.getBitcast(MVT::v4i64, V1); + V2 = DAG.getBitcast(MVT::v4i64, V2); + + V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1); + V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2); + + // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in + // the upper bits of the result using an unpckldq. + SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, + { 0, 1, 2, 3, 16, 17, 18, 19, + 4, 5, 6, 7, 20, 21, 22, 23 }); + // Insert the unpckldq into a zero vector to widen to v32i8. + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8, + DAG.getConstant(0, DL, MVT::v32i8), Unpack, + DAG.getIntPtrConstant(0, DL)); +} + + /// Handle lowering of 4-lane 64-bit floating point shuffles. /// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 @@ -15236,7 +15600,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, - DAG.getConstant(VPERMILPMask, DL, MVT::i8)); + DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8)); } // With AVX2 we have direct support for this permutation. @@ -15256,8 +15620,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return V; // Otherwise, fall back. - return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG, - Subtarget); + return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask, + DAG, Subtarget); } // Use dedicated unpack instructions for masks that match their pattern. @@ -15269,7 +15633,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Blend; // Check if the blend happens to exactly fit that of SHUFPD. - if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) + if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Op; // If we have one input in place, then we can permute the other input and @@ -15473,8 +15838,8 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1); // Otherwise, fall back. 
- return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, - DAG, Subtarget); + return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask, + DAG, Subtarget); } // Try to simplify this by merging 128-bit lanes to enable a lane-based @@ -15681,8 +16046,8 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) return V; - return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask, - DAG, Subtarget); + return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask, + DAG, Subtarget); } SmallVector<int, 8> RepeatedMask; @@ -15780,8 +16145,8 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; - return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG, - Subtarget); + return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask, + DAG, Subtarget); } if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2, @@ -15803,6 +16168,14 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; + // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed + // by zeroable elements in the remaining 24 elements. Turn this into two + // vmovqb instructions shuffled together. + if (Subtarget.hasVLX()) + if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2, + Mask, Zeroable, DAG)) + return V; + // Otherwise fall back on generic lowering. return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG); @@ -15974,7 +16347,7 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, } return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1], - DAG.getConstant(PermMask, DL, MVT::i8)); + DAG.getTargetConstant(PermMask, DL, MVT::i8)); } /// Handle lowering of 8-lane 64-bit floating point shuffles. @@ -15999,7 +16372,7 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) | ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7); return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1, - DAG.getConstant(VPERMILPMask, DL, MVT::i8)); + DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8)); } SmallVector<int, 4> RepeatedMask; @@ -16016,7 +16389,8 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Unpck; // Check if the blend happens to exactly fit that of SHUFPD. - if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) + if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Op; if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2, @@ -16389,6 +16763,49 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, } } +static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask, + MVT VT, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + // Shuffle should be unary. + if (!V2.isUndef()) + return SDValue(); + + int ShiftAmt = -1; + int NumElts = Mask.size(); + for (int i = 0; i != NumElts; ++i) { + int M = Mask[i]; + assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) && + "Unexpected mask index."); + if (M < 0) + continue; + + // The first non-undef element determines our shift amount. + if (ShiftAmt < 0) { + ShiftAmt = M - i; + // Need to be shifting right. 
+ if (ShiftAmt <= 0) + return SDValue(); + } + // All non-undef elements must shift by the same amount. + if (ShiftAmt != M - i) + return SDValue(); + } + assert(ShiftAmt >= 0 && "All undef?"); + + // Great we found a shift right. + MVT WideVT = VT; + if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8) + WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, + DAG.getUNDEF(WideVT), V1, + DAG.getIntPtrConstant(0, DL)); + Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res, + DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); +} + // Determine if this shuffle can be implemented with a KSHIFT instruction. // Returns the shift amount if possible or -1 if not. This is a simplified // version of matchShuffleAsShift. @@ -16434,13 +16851,20 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"); - unsigned NumElts = Mask.size(); + int NumElts = Mask.size(); // Try to recognize shuffles that are just padding a subvector with zeros. - unsigned SubvecElts = 0; - for (int i = 0; i != (int)NumElts; ++i) { - if (Mask[i] >= 0 && Mask[i] != i) - break; + int SubvecElts = 0; + int Src = -1; + for (int i = 0; i != NumElts; ++i) { + if (Mask[i] >= 0) { + // Grab the source from the first valid mask. All subsequent elements need + // to use this same source. + if (Src < 0) + Src = Mask[i] / NumElts; + if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i) + break; + } ++SubvecElts; } @@ -16451,30 +16875,54 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Make sure the number of zeroable bits in the top at least covers the bits // not covered by the subvector. - if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) { + if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) { + assert(Src >= 0 && "Expected a source!"); MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts); SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, - V1, DAG.getIntPtrConstant(0, DL)); + Src == 0 ? V1 : V2, + DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, - getZeroVector(VT, Subtarget, DAG, DL), + DAG.getConstant(0, DL, VT), Extract, DAG.getIntPtrConstant(0, DL)); } + // Try a simple shift right with undef elements. Later we'll try with zeros. + if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, + DAG)) + return Shift; + // Try to match KSHIFTs. - // TODO: Support narrower than legal shifts by widening and extracting. - if (NumElts >= 16 || (Subtarget.hasDQI() && NumElts == 8)) { - unsigned Offset = 0; - for (SDValue V : { V1, V2 }) { - unsigned Opcode; - int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable); - if (ShiftAmt >= 0) - return DAG.getNode(Opcode, DL, VT, V, - DAG.getConstant(ShiftAmt, DL, MVT::i8)); - Offset += NumElts; // Increment for next iteration. + unsigned Offset = 0; + for (SDValue V : { V1, V2 }) { + unsigned Opcode; + int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable); + if (ShiftAmt >= 0) { + MVT WideVT = VT; + if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8) + WideVT = Subtarget.hasDQI() ? 
MVT::v8i1 : MVT::v16i1; + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, + DAG.getUNDEF(WideVT), V, + DAG.getIntPtrConstant(0, DL)); + // Widened right shifts need two shifts to ensure we shift in zeroes. + if (Opcode == X86ISD::KSHIFTR && WideVT != VT) { + int WideElts = WideVT.getVectorNumElements(); + // Shift left to put the original vector in the MSBs of the new size. + Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res, + DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8)); + // Increase the shift amount to account for the left shift. + ShiftAmt += WideElts - NumElts; + } + + Res = DAG.getNode(Opcode, DL, WideVT, Res, + DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); } + Offset += NumElts; // Increment for next iteration. } + MVT ExtVT; switch (VT.SimpleTy) { default: @@ -16594,7 +17042,7 @@ static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) { static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); + ArrayRef<int> OrigMask = SVOp->getMask(); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); MVT VT = Op.getSimpleValueType(); @@ -16620,8 +17068,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, // undef as well. This makes it easier to match the shuffle based solely on // the mask. if (V2IsUndef && - any_of(Mask, [NumElements](int M) { return M >= NumElements; })) { - SmallVector<int, 8> NewMask(Mask.begin(), Mask.end()); + any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) { + SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end()); for (int &M : NewMask) if (M >= NumElements) M = -1; @@ -16629,15 +17077,16 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, } // Check for illegal shuffle mask element index values. - int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit; - assert(llvm::all_of(Mask, + int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2); + (void)MaskUpperLimit; + assert(llvm::all_of(OrigMask, [&](int M) { return -1 <= M && M < MaskUpperLimit; }) && "Out of bounds shuffle index"); // We actually see shuffles that are entirely re-arrangements of a set of // zero inputs. This mostly happens while decomposing complex shuffles into // simple ones. Directly lower these as a buildvector of zeros. - APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + APInt Zeroable = computeZeroableShuffleElements(OrigMask, V1, V2); if (Zeroable.isAllOnesValue()) return getZeroVector(VT, Subtarget, DAG, DL); @@ -16645,11 +17094,11 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, // Create an alternative mask with info about zeroable elements. // Here we do not set undef elements as zeroable. - SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end()); + SmallVector<int, 64> ZeroableMask(OrigMask.begin(), OrigMask.end()); if (V2IsZero) { assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!"); for (int i = 0; i != NumElements; ++i) - if (Mask[i] != SM_SentinelUndef && Zeroable[i]) + if (OrigMask[i] != SM_SentinelUndef && Zeroable[i]) ZeroableMask[i] = SM_SentinelZero; } @@ -16664,7 +17113,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, // by obfuscating the operands with bitcasts. 
// TODO: Avoid lowering directly from this top-level function: make this // a query (canLowerAsBroadcast) and defer lowering to the type-based calls. - if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask, Subtarget, DAG)) return Broadcast; @@ -16700,8 +17149,11 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, } // Commute the shuffle if it will improve canonicalization. - if (canonicalizeShuffleMaskWithCommute(Mask)) - return DAG.getCommutedVectorShuffle(*SVOp); + SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end()); + if (canonicalizeShuffleMaskWithCommute(Mask)) { + ShuffleVectorSDNode::commuteMask(Mask); + std::swap(V1, V2); + } if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget)) return V; @@ -16910,7 +17362,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, // Use kshiftr instruction to move to the lower element. Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec, - DAG.getConstant(IdxVal, dl, MVT::i8)); + DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getIntPtrConstant(0, dl)); @@ -17137,8 +17589,8 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || (Subtarget.hasAVX2() && EltVT == MVT::i32)) { SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); - N2 = DAG.getIntPtrConstant(1, dl); - return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2); + return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, + DAG.getTargetConstant(1, dl, MVT::i8)); } } @@ -17207,14 +17659,14 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // But if optimizing for size and there's a load folding opportunity, // generate insertps because blendps does not have a 32-bit memory // operand form. - N2 = DAG.getIntPtrConstant(1, dl); N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); - return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2); + return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, + DAG.getTargetConstant(1, dl, MVT::i8)); } - N2 = DAG.getIntPtrConstant(IdxVal << 4, dl); // Create this as a scalar to vector.. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); - return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, + DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8)); } // PINSR* works with constant index. @@ -17300,7 +17752,7 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, // Shift to the LSB. Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec, - DAG.getConstant(IdxVal, dl, MVT::i8)); + DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec, DAG.getIntPtrConstant(0, dl)); @@ -17841,10 +18293,10 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, std::swap(Op0, Op1); APInt APIntShiftAmt; - if (isConstantSplat(Amt, APIntShiftAmt)) { + if (X86::isConstantSplat(Amt, APIntShiftAmt)) { uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits()); - return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, - Op0, Op1, DAG.getConstant(ShiftAmt, DL, MVT::i8)); + return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, Op0, + Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); } return DAG.getNode(IsFSHR ? 
X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT, @@ -17970,6 +18422,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); + if (VT == MVT::f128) + return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT)); + if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) return Extract; @@ -18072,6 +18527,16 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, return Result; } +/// Horizontal vector math instructions may be slower than normal math with +/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch +/// implementation, and likely shuffle complexity of the alternate sequence. +static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool HasFastHOps = Subtarget.hasFastHorizontalOps(); + return !IsSingleSource || IsOptimizingSize || HasFastHOps; +} + /// 64-bit unsigned integer to double expansion. static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -18126,8 +18591,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; - if (Subtarget.hasSSE3()) { - // FIXME: The 'haddpd' instruction may be slower than 'shuffle + addsd'. + if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) { Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1}); @@ -18273,7 +18737,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // Low will be bitcasted right away, so do not bother bitcasting back to its // original type. Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, - VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); + VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8)); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh); @@ -18281,7 +18745,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // High will be bitcasted right away, so do not bother bitcasting back to // its original type. 
High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, - VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); + VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8)); } else { SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT); // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; @@ -18329,16 +18793,18 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDValue N0 = Op.getOperand(0); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); + MVT SrcVT = N0.getSimpleValueType(); + MVT DstVT = Op.getSimpleValueType(); + + if (DstVT == MVT::f128) + return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT)); - if (Op.getSimpleValueType().isVector()) + if (DstVT.isVector()) return lowerUINT_TO_FP_vec(Op, DAG, Subtarget); if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) return Extract; - MVT SrcVT = N0.getSimpleValueType(); - MVT DstVT = Op.getSimpleValueType(); - if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) { // Conversions from unsigned i32 to f32/f64 are legal, @@ -18346,6 +18812,12 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, return Op; } + // Promote i32 to i64 and use a signed conversion on 64-bit targets. + if (SrcVT == MVT::i32 && Subtarget.is64Bit()) { + N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, N0); + return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, N0); + } + if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget)) return V; @@ -18579,7 +19051,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, // Custom legalize v8i8->v8i64 on CPUs without avx512bw. if (InVT == MVT::v8i8) { - if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64) + if (VT != MVT::v8i64) return SDValue(); In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), @@ -18602,10 +19074,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. // Concat upper and lower parts. // - - MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), - VT.getVectorNumElements() / 2); - + MVT HalfVT = VT.getHalfNumVectorElementsVT(); SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In); // Short-circuit if we can determine that each 128-bit half is the same value. @@ -18903,9 +19372,29 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); - // If called by the legalizer just return. - if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT)) + // If we're called by the type legalizer, handle a few cases. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(InVT)) { + if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) && + VT.is128BitVector()) { + assert(Subtarget.hasVLX() && "Unexpected subtarget!"); + // The default behavior is to truncate one step, concatenate, and then + // truncate the remainder. We'd rather produce two 64-bit results and + // concatenate those. + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(In, DL); + + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + + Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo); + Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); + } + + // Otherwise let default legalization handle it. 
return SDValue(); + } if (VT.getVectorElementType() == MVT::i1) return LowerTruncateVecI1(Op, DAG, Subtarget); @@ -18940,6 +19429,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget)) return V; + // Handle truncation of V256 to V128 using shuffles. + assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!"); + if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget.hasInt256()) { @@ -19016,22 +19508,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi); } - // Handle truncation of V256 to V128 using shuffles. - assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!"); - - assert(Subtarget.hasAVX() && "256-bit vector without AVX!"); - - unsigned NumElems = VT.getVectorNumElements(); - MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2); - - SmallVector<int, 16> MaskVec(NumElems * 2, -1); - // Prepare truncation shuffle mask - for (unsigned i = 0; i != NumElems; ++i) - MaskVec[i] = i * 2; - In = DAG.getBitcast(NVT, In); - SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, - DAG.getIntPtrConstant(0, DL)); + llvm_unreachable("All 256->128 cases should have been handled above!"); } SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { @@ -19041,6 +19518,17 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { MVT SrcVT = Src.getSimpleValueType(); SDLoc dl(Op); + if (SrcVT == MVT::f128) { + RTLIB::Libcall LC; + if (Op.getOpcode() == ISD::FP_TO_SINT) + LC = RTLIB::getFPTOSINT(SrcVT, VT); + else + LC = RTLIB::getFPTOUINT(SrcVT, VT); + + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, LC, VT, Src, CallOptions, SDLoc(Op)).first; + } + if (VT.isVector()) { if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) { MVT ResVT = MVT::v4i32; @@ -19075,14 +19563,27 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT); - if (!IsSigned && Subtarget.hasAVX512()) { - // Conversions from f32/f64 should be legal. - if (UseSSEReg) + if (!IsSigned && UseSSEReg) { + // Conversions from f32/f64 with AVX512 should be legal. + if (Subtarget.hasAVX512()) return Op; - // Use default expansion. + // Use default expansion for i64. if (VT == MVT::i64) return SDValue(); + + assert(VT == MVT::i32 && "Unexpected VT!"); + + // Promote i32 to i64 and use a signed operation on 64-bit targets. + if (Subtarget.is64Bit()) { + SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + } + + // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can + // use fisttp which will be handled later. + if (!Subtarget.hasSSE3()) + return SDValue(); } // Promote i16 to i32 if we can use a SSE operation. 
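The FP_TO_UINT promotion added above (and its UINT_TO_FP mirror earlier, which zero-extends i32 to i64 before a signed convert) is justified by a range argument: every u32 value fits in i64, so the signed i64 operation is exact wherever the unsigned i32 one is defined. A minimal scalar sketch of the FP_TO_UINT direction, assuming an in-range input just as ISD::FP_TO_UINT itself does:

    #include <cstdint>

    // fptoui f64 -> u32 on a 64-bit target, expressed as fptosi f64 -> i64
    // plus truncation. Exact for inputs in [0, 2^32); out-of-range inputs
    // are undefined, matching the FP_TO_UINT contract.
    static uint32_t fptoui32_via_i64(double X) {
      int64_t Wide = static_cast<int64_t>(X); // signed convert, exact for u32 range
      return static_cast<uint32_t>(Wide);     // truncate back to 32 bits
    }

On x86-64 this becomes a single cvttsd2si with a 64-bit destination register, which is why the promotion is profitable.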
@@ -19103,12 +19604,17 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases."); } -static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT SVT = In.getSimpleValueType(); + if (VT == MVT::f128) { + RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT); + return LowerF128Call(Op, DAG, LC); + } + assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); return DAG.getNode(X86ISD::VFPEXT, DL, VT, @@ -19116,14 +19622,31 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { In, DAG.getUNDEF(SVT))); } -/// Horizontal vector math instructions may be slower than normal math with -/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch -/// implementation, and likely shuffle complexity of the alternate sequence. -static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize(); - bool HasFastHOps = Subtarget.hasFastHorizontalOps(); - return !IsSingleSource || IsOptimizingSize || HasFastHOps; +SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); + MVT SVT = In.getSimpleValueType(); + + // It's legal except when f128 is involved + if (SVT != MVT::f128) + return Op; + + RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, VT); + + // FP_ROUND node has a second operand indicating whether it is known to be + // precise. That doesn't take part in the LibCall so we can't directly use + // LowerF128Call. + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, LC, VT, In, CallOptions, SDLoc(Op)).first; +} + +// FIXME: This is a hack to allow FP_ROUND to be marked Custom without breaking +// the default expansion of STRICT_FP_ROUND. +static SDValue LowerSTRICT_FP_ROUND(SDValue Op, SelectionDAG &DAG) { + // FIXME: Need to form a libcall with an input chain for f128. + assert(Op.getOperand(0).getValueType() != MVT::f128 && + "Don't know how to handle f128 yet!"); + return Op; } /// Depending on uarch and/or optimizing for size, we might prefer to use a @@ -19200,8 +19723,13 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG, /// Depending on uarch and/or optimizing for size, we might prefer to use a /// vector operation in place of the typical scalar operation. -static SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType() == MVT::f128) { + RTLIB::Libcall LC = Op.getOpcode() == ISD::FADD ? 
RTLIB::ADD_F128 + : RTLIB::SUB_F128; + return LowerF128Call(Op, DAG, LC); + } + assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) && "Only expecting float/double"); return lowerAddSubToHorizontalOp(Op, DAG, Subtarget); @@ -19358,13 +19886,13 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl, SelectionDAG &DAG) { return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(Cond, dl, MVT::i8), EFLAGS); + DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS); } /// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...)) /// style scalarized (associative) reduction patterns. -static bool matchBitOpReduction(SDValue Op, ISD::NodeType BinOp, - SmallVectorImpl<SDValue> &SrcOps) { +static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, + SmallVectorImpl<SDValue> &SrcOps) { SmallVector<SDValue, 8> Opnds; DenseMap<SDValue, APInt> SrcOpMap; EVT VT = MVT::Other; @@ -19437,7 +19965,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, return SDValue(); SmallVector<SDValue, 8> VecIns; - if (!matchBitOpReduction(Op, ISD::OR, VecIns)) + if (!matchScalarReduction(Op, ISD::OR, VecIns)) return SDValue(); // Quit if not 128/256-bit vector. @@ -19461,8 +19989,8 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); } - X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, DL, - MVT::i8); + X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, + DL, MVT::i8); return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back()); } @@ -19576,6 +20104,13 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case X86ISD::XOR: case X86ISD::AND: return SDValue(Op.getNode(), 1); + case ISD::SSUBO: + case ISD::USUBO: { + // /USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag. + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0), + Op->getOperand(1)).getValue(1); + } default: default_case: break; @@ -19766,6 +20301,63 @@ unsigned X86TargetLowering::combineRepeatedFPDivisors() const { return 2; } +SDValue +X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, + SmallVectorImpl<SDNode *> &Created) const { + AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); + if (isIntDivCheap(N->getValueType(0), Attr)) + return SDValue(N,0); // Lower SDIV as SDIV + + assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) && + "Unexpected divisor!"); + + // Only perform this transform if CMOV is supported otherwise the select + // below will become a branch. + if (!Subtarget.hasCMov()) + return SDValue(); + + // fold (sdiv X, pow2) + EVT VT = N->getValueType(0); + // FIXME: Support i8. + if (VT != MVT::i16 && VT != MVT::i32 && + !(Subtarget.is64Bit() && VT == MVT::i64)) + return SDValue(); + + unsigned Lg2 = Divisor.countTrailingZeros(); + + // If the divisor is 2 or -2, the default expansion is better. + if (Lg2 == 1) + return SDValue(); + + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + SDValue Zero = DAG.getConstant(0, DL, VT); + APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2); + SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT); + + // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right. 
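A worked instance of that bias (illustrative, not from the patch), for i32 with Divisor == 8, i.e. Lg2 == 3 and Pow2MinusOne == 7:
// -1 sdiv 8 must be 0, but the arithmetic shift alone gives -1 >> 3 == -1
// (it rounds toward negative infinity). Biasing negative inputs first gives
// (-1 + 7) >> 3 == 0, the required round-toward-zero result, while
// non-negative inputs skip the bias: 17 >> 3 == 2 == 17 sdiv 8.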
+ SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT); + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); + SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0); + + Created.push_back(Cmp.getNode()); + Created.push_back(Add.getNode()); + Created.push_back(CMov.getNode()); + + // Divide by pow2. + SDValue SRA = + DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i64)); + + // If we're dividing by a positive value, we're done. Otherwise, we must + // negate the result. + if (Divisor.isNonNegative()) + return SRA; + + Created.push_back(SRA.getNode()); + return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA); +} + /// Result of 'and' is compared against zero. Change to a BT node if possible. /// Returns the BT node and the condition code needed to use it. static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, @@ -19842,8 +20434,8 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, if (Src.getValueType() != BitNo.getValueType()) BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo); - X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B, - dl, MVT::i8); + X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B, + dl, MVT::i8); return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo); } @@ -19935,13 +20527,6 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); - // If this is a seteq make sure any build vectors of all zeros are on the RHS. - // This helps with vptestm matching. - // TODO: Should we just canonicalize the setcc during DAG combine? - if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) && - ISD::isBuildVectorAllZeros(Op0.getNode())) - std::swap(Op0, Op1); - // Prefer SETGT over SETLT. if (SetCCOpcode == ISD::SETLT) { SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode); @@ -20007,7 +20592,7 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, // Only do this pre-AVX since vpcmp* is no longer destructive. if (Subtarget.hasAVX()) return SDValue(); - SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, false); + SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false); if (!ULEOp1) return SDValue(); Op1 = ULEOp1; @@ -20018,7 +20603,7 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, // This is beneficial because materializing a constant 0 for the PCMPEQ is // probably cheaper than XOR+PCMPGT using 2 different vector constants: // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0 - SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, true); + SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true); if (!UGEOp1) return SDValue(); Op1 = Op0; @@ -20086,14 +20671,14 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, } SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getConstant(CC0, dl, MVT::i8)); + DAG.getTargetConstant(CC0, dl, MVT::i8)); SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getConstant(CC1, dl, MVT::i8)); + DAG.getTargetConstant(CC1, dl, MVT::i8)); Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); } else { // Handle all other FP comparisons here. 
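For orientation, the SSECC immediate feeding CMPPS/CMPPD below uses the standard SSE compare-predicate encoding (stated here as background, not taken from this hunk):
// imm 0 = EQ_OQ, 1 = LT_OS, 2 = LE_OS, 3 = UNORD_Q,
// imm 4 = NEQ_UQ, 5 = NLT_US, 6 = NLE_US, 7 = ORD_Q
// e.g. a SETOLT vector compare becomes cmpps xmm0, xmm1, 1.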
Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getConstant(SSECC, dl, MVT::i8)); + DAG.getTargetConstant(SSECC, dl, MVT::i8)); } // If this is SSE/AVX CMPP, bitcast the result back to integer to match the @@ -20106,16 +20691,12 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, } MVT VTOp0 = Op0.getSimpleValueType(); + (void)VTOp0; assert(VTOp0 == Op1.getSimpleValueType() && "Expected operands with same type!"); assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() && "Invalid number of packed elements for source and destination!"); - // This is being called by type legalization because v2i32 is marked custom - // for result type legalization for v2f32. - if (VTOp0 == MVT::v2i32) - return SDValue(); - // The non-AVX512 code below works under the assumption that source and // destination types are the same. assert((Subtarget.hasAVX512() || (VT == VTOp0)) && @@ -20153,7 +20734,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM; return DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getConstant(CmpMode, dl, MVT::i8)); + DAG.getTargetConstant(CmpMode, dl, MVT::i8)); } // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2. @@ -20222,21 +20803,19 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, TLI.isOperationLegal(ISD::UMIN, VT)) { // If we have a constant operand, increment/decrement it and change the // condition to avoid an invert. - if (Cond == ISD::SETUGT && - ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) { - return !C->getAPIntValue().isMaxValue(); - })) { + if (Cond == ISD::SETUGT) { // X > C --> X >= (C+1) --> X == umax(X, C+1) - Op1 = DAG.getNode(ISD::ADD, dl, VT, Op1, DAG.getConstant(1, dl, VT)); - Cond = ISD::SETUGE; + if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) { + Op1 = UGTOp1; + Cond = ISD::SETUGE; + } } - if (Cond == ISD::SETULT && - ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) { - return !C->getAPIntValue().isNullValue(); - })) { + if (Cond == ISD::SETULT) { // X < C --> X <= (C-1) --> X == umin(X, C-1) - Op1 = DAG.getNode(ISD::SUB, dl, VT, Op1, DAG.getConstant(1, dl, VT)); - Cond = ISD::SETULE; + if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) { + Op1 = ULTOp1; + Cond = ISD::SETULE; + } } bool Invert = false; unsigned Opc; @@ -20360,11 +20939,11 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, return Result; } -// Try to select this as a KORTEST+SETCC if possible. -static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC, - const SDLoc &dl, SelectionDAG &DAG, - const X86Subtarget &Subtarget, - SDValue &X86CC) { +// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible. +static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, + const SDLoc &dl, SelectionDAG &DAG, + const X86Subtarget &Subtarget, + SDValue &X86CC) { // Only support equality comparisons. if (CC != ISD::SETEQ && CC != ISD::SETNE) return SDValue(); @@ -20389,6 +20968,21 @@ static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC, } else return SDValue(); + // If the input is an AND, we can combine its operands into the KTEST.
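The fold is sound because KTEST sets ZF exactly when the AND of its two mask operands is all zeros; a sketch with mask registers modeled as plain integers (illustrative):
// setcc(and(A, B), 0, eq)  ==  ZF after KTEST A, B
// e.g. A = 0b0011, B = 0b1100  ->  A & B == 0, so ZF == 1 and the setcc is true.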
+ bool KTestable = false; + if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1)) + KTestable = true; + if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)) + KTestable = true; + if (!isNullConstant(Op1)) + KTestable = false; + if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) { + SDValue LHS = Op0.getOperand(0); + SDValue RHS = Op0.getOperand(1); + X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); + return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS); + } + // If the input is an OR, we can combine its operands into the KORTEST. SDValue LHS = Op0; SDValue RHS = Op0; if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) { LHS = Op0.getOperand(0); RHS = Op0.getOperand(1); } - X86CC = DAG.getConstant(X86Cond, dl, MVT::i8); + X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); } @@ -20425,9 +21019,9 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, return PTEST; } - // Try to lower using KORTEST. - if (SDValue KORTEST = EmitKORTEST(Op0, Op1, CC, dl, DAG, Subtarget, X86CC)) - return KORTEST; + // Try to lower using KORTEST or KTEST. + if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC)) + return Test; // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. @@ -20442,7 +21036,7 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, if (Invert) { X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); - X86CC = DAG.getConstant(CCode, dl, MVT::i8); + X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8); } return Op0.getOperand(1); @@ -20456,7 +21050,7 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG); EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); - X86CC = DAG.getConstant(CondCode, dl, MVT::i8); + X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8); return EFLAGS; } @@ -20472,6 +21066,19 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + // Handle f128 first, since one possible outcome is a normal integer + // comparison which gets handled by emitFlagsForSetcc. + if (Op0.getValueType() == MVT::f128) { + softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1); + + // If softenSetCCOperands returned a scalar, use it.
+ if (!Op1.getNode()) { + assert(Op0.getValueType() == Op.getValueType() && + "Unexpected setcc expansion!"); + return Op0; + } + } + SDValue X86CC; SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC); if (!EFLAGS) @@ -20612,15 +21219,16 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); if (Subtarget.hasAVX512()) { - SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, - CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); + SDValue Cmp = + DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1, + DAG.getTargetConstant(SSECC, DL, MVT::i8)); assert(!VT.isVector() && "Not a scalar type?"); return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); } if (SSECC < 8 || Subtarget.hasAVX()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, - DAG.getConstant(SSECC, DL, MVT::i8)); + DAG.getTargetConstant(SSECC, DL, MVT::i8)); // If we have AVX, we can use a variable vector select (VBLENDV) instead // of 3 logic instructions for size savings and potentially speed. @@ -20718,8 +21326,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Cond.getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); - unsigned CondCode = - cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); + unsigned CondCode = Cond.getConstantOperandVal(0); if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { @@ -20807,8 +21414,6 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); - MVT VT = Op.getSimpleValueType(); - bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && !isScalarFPTypeInSSEReg(VT)) // FPStack? @@ -20826,7 +21431,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { X86::CondCode X86Cond; std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); - CC = DAG.getConstant(X86Cond, DL, MVT::i8); + CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8); AddTest = false; } @@ -20848,7 +21453,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } if (AddTest) { - CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8); + CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8); Cond = EmitCmp(Cond, DAG.getConstant(0, DL, Cond.getValueType()), X86::COND_NE, DL, DAG); } @@ -20864,9 +21469,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (isNullConstant(Op1) || isNullConstant(Op2))) { - SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), - DAG.getConstant(X86::COND_B, DL, MVT::i8), - Cond); + SDValue Res = + DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond); if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) return DAG.getNOT(DL, Res, Res.getValueType()); return Res; @@ -21037,8 +21642,8 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, // pre-AVX2 256-bit extensions need to be split into 128-bit instructions. 
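As a concrete shape (an assumed example, the patch does not spell one out): on AVX1 a 256-bit extend-in-reg becomes two 128-bit VPMOVZX/VPMOVSX ops whose results are concatenated:
// v8i32 zext_invec from v16i8:
//   lo = vpmovzxbd (bytes 0..3)               -> v4i32
//   hi = vpmovzxbd (bytes 4..7, via HiMask)   -> v4i32
//   result = concat_vectors(lo, hi)           -> v8i32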
if (Subtarget.hasAVX()) { assert(VT.is256BitVector() && "256-bit vector expected"); - int HalfNumElts = NumElts / 2; - MVT HalfVT = MVT::getVectorVT(SVT, HalfNumElts); + MVT HalfVT = VT.getHalfNumVectorElementsVT(); + int HalfNumElts = HalfVT.getVectorNumElements(); unsigned NumSrcElts = InVT.getVectorNumElements(); SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef); @@ -21081,7 +21686,7 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, unsigned SignExtShift = DestWidth - InSVT.getSizeInBits(); SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr, - DAG.getConstant(SignExtShift, dl, MVT::i8)); + DAG.getTargetConstant(SignExtShift, dl, MVT::i8)); } if (VT == MVT::v2i64) { @@ -21119,7 +21724,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, // Custom legalize v8i8->v8i64 on CPUs without avx512bw. if (InVT == MVT::v8i8) { - if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64) + if (VT != MVT::v8i64) return SDValue(); In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), @@ -21138,10 +21743,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, // for v4i32 the high shuffle mask will be {2, 3, -1, -1} // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 // concat the vectors to original VT - - MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), - VT.getVectorNumElements() / 2); - + MVT HalfVT = VT.getHalfNumVectorElementsVT(); SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In); unsigned NumElems = InVT.getVectorNumElements(); @@ -21165,7 +21767,7 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) { // Splitting volatile memory ops is not allowed unless the operation was not // legal to begin with. We are assuming the input op is legal (this transform // is only used for targets with AVX). - if (Store->isVolatile()) + if (!Store->isSimple()) return SDValue(); MVT StoreVT = StoredVal.getSimpleValueType(); @@ -21201,7 +21803,7 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, // Splitting volatile memory ops is not allowed unless the operation was not // legal to begin with. We are assuming the input op is legal (this transform // is only used for targets with AVX). - if (Store->isVolatile()) + if (!Store->isSimple()) return SDValue(); MVT StoreSVT = StoreVT.getScalarType(); @@ -21266,14 +21868,13 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, return SDValue(); } + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 && "Unexpected VT"); - if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) != - TargetLowering::TypeWidenVector) - return SDValue(); + assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) == + TargetLowering::TypeWidenVector && "Unexpected type action!"); - MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(), - StoreVT.getVectorNumElements() * 2); + EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT); StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal, DAG.getUNDEF(StoreVT)); @@ -21313,11 +21914,10 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode()); SDLoc dl(Ld); - EVT MemVT = Ld->getMemoryVT(); // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads. 
if (RegVT.getVectorElementType() == MVT::i1) { - assert(EVT(RegVT) == MemVT && "Expected non-extending load"); + assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load"); assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT"); assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && "Expected AVX512F without AVX512DQI"); @@ -21336,176 +21936,7 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl); } - // Nothing useful we can do without SSE2 shuffles. - assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2."); - - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned RegSz = RegVT.getSizeInBits(); - - ISD::LoadExtType Ext = Ld->getExtensionType(); - - assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) - && "Only anyext and sext are currently implemented."); - assert(MemVT != RegVT && "Cannot extend to the same type"); - assert(MemVT.isVector() && "Must load a vector from memory"); - - unsigned NumElems = RegVT.getVectorNumElements(); - unsigned MemSz = MemVT.getSizeInBits(); - assert(RegSz > MemSz && "Register size must be greater than the mem size"); - - if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) { - // The only way in which we have a legal 256-bit vector result but not the - // integer 256-bit operations needed to directly lower a sextload is if we - // have AVX1 but not AVX2. In that case, we can always emit a sextload to - // a 128-bit vector and a normal sign_extend to 256-bits that should get - // correctly legalized. We do this late to allow the canonical form of - // sextload to persist throughout the rest of the DAG combiner -- it wants - // to fold together any extensions it can, and so will fuse a sign_extend - // of an sextload into a sextload targeting a wider value. - SDValue Load; - if (MemSz == 128) { - // Just switch this to a normal load. - assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, " - "it must be a legal 128-bit vector " - "type!"); - Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->getAlignment(), - Ld->getMemOperand()->getFlags()); - } else { - assert(MemSz < 128 && - "Can't extend a type wider than 128 bits to a 256 bit vector!"); - // Do an sext load to a 128-bit vector type. We want to use the same - // number of elements, but elements half as wide. This will end up being - // recursively lowered by this routine, but will succeed as we definitely - // have all the necessary features if we're using AVX1. - EVT HalfEltVT = - EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); - EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); - Load = - DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), MemVT, Ld->getAlignment(), - Ld->getMemOperand()->getFlags()); - } - - // Replace chain users with the new chain. - assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); - - // Finally, do a normal sign-extend to the desired register. - SDValue SExt = DAG.getSExtOrTrunc(Load, dl, RegVT); - return DAG.getMergeValues({SExt, Load.getValue(1)}, dl); - } - - // All sizes must be a power of two. - assert(isPowerOf2_32(RegSz * MemSz * NumElems) && - "Non-power-of-two elements are not custom lowered!"); - - // Attempt to load the original value using scalar loads. - // Find the largest scalar type that divides the total loaded size. 
- MVT SclrLoadTy = MVT::i8; - for (MVT Tp : MVT::integer_valuetypes()) { - if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { - SclrLoadTy = Tp; - } - } - - // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. - if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && - (64 <= MemSz)) - SclrLoadTy = MVT::f64; - - // Calculate the number of scalar loads that we need to perform - // in order to load our vector from memory. - unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); - - assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && - "Can only lower sext loads with a single scalar load!"); - - unsigned loadRegSize = RegSz; - if (Ext == ISD::SEXTLOAD && RegSz >= 256) - loadRegSize = 128; - - // If we don't have BWI we won't be able to create the shuffle needed for - // v8i8->v8i64. - if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && - MemVT == MVT::v8i8) - loadRegSize = 128; - - // Represent our vector as a sequence of elements which are the - // largest scalar that we can load. - EVT LoadUnitVecVT = EVT::getVectorVT( - *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits()); - - // Represent the data using the same element type that is stored in - // memory. In practice, we ''widen'' MemVT. - EVT WideVecVT = - EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), - loadRegSize / MemVT.getScalarSizeInBits()); - - assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && - "Invalid vector type"); - - // We can't shuffle using an illegal type. - assert(TLI.isTypeLegal(WideVecVT) && - "We only lower types that form legal widened vector types"); - - SmallVector<SDValue, 8> Chains; - SDValue Ptr = Ld->getBasePtr(); - unsigned OffsetInc = SclrLoadTy.getSizeInBits() / 8; - SDValue Increment = DAG.getConstant(OffsetInc, dl, - TLI.getPointerTy(DAG.getDataLayout())); - SDValue Res = DAG.getUNDEF(LoadUnitVecVT); - - unsigned Offset = 0; - for (unsigned i = 0; i < NumLoads; ++i) { - unsigned NewAlign = MinAlign(Ld->getAlignment(), Offset); - - // Perform a single load. - SDValue ScalarLoad = - DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, - Ld->getPointerInfo().getWithOffset(Offset), - NewAlign, Ld->getMemOperand()->getFlags()); - Chains.push_back(ScalarLoad.getValue(1)); - // Create the first element type using SCALAR_TO_VECTOR in order to avoid - // another round of DAGCombining. - if (i == 0) - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); - else - Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, - ScalarLoad, DAG.getIntPtrConstant(i, dl)); - - Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); - Offset += OffsetInc; - } - - SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); - - // Bitcast the loaded value to a vector of the original element type, in - // the size of the target vector type. - SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res); - unsigned SizeRatio = RegSz / MemSz; - - if (Ext == ISD::SEXTLOAD) { - SDValue Sext = getExtendInVec(ISD::SIGN_EXTEND, dl, RegVT, SlicedVec, DAG); - return DAG.getMergeValues({Sext, TF}, dl); - } - - if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && - MemVT == MVT::v8i8) { - SDValue Sext = getExtendInVec(ISD::ZERO_EXTEND, dl, RegVT, SlicedVec, DAG); - return DAG.getMergeValues({Sext, TF}, dl); - } - - // Redistribute the loaded elements into the different locations. 
- SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i * SizeRatio] = i; - - SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, - DAG.getUNDEF(WideVecVT), ShuffleVec); - - // Bitcast to the requested type. - Shuff = DAG.getBitcast(RegVT, Shuff); - return DAG.getMergeValues({Shuff, TF}, dl); + return SDValue(); } /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes @@ -21610,7 +22041,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (Inverted) X86Cond = X86::GetOppositeBranchCondition(X86Cond); - CC = DAG.getConstant(X86Cond, dl, MVT::i8); + CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); addTest = false; } else { unsigned CondOpc; @@ -21638,10 +22069,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (Cmp == Cond.getOperand(1).getOperand(1) && isX86LogicalCmp(Cmp) && Op.getNode()->hasOneUse()) { - X86::CondCode CCode = - (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); - CCode = X86::GetOppositeBranchCondition(CCode); - CC = DAG.getConstant(CCode, dl, MVT::i8); + X86::CondCode CCode0 = + (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); + CCode0 = X86::GetOppositeBranchCondition(CCode0); + CC = DAG.getTargetConstant(CCode0, dl, MVT::i8); SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order @@ -21654,12 +22085,12 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { (void)NewBR; Dest = FalseBB; - Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), - Chain, Dest, CC, Cmp); - X86::CondCode CCode = - (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); - CCode = X86::GetOppositeBranchCondition(CCode); - CC = DAG.getConstant(CCode, dl, MVT::i8); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, + Dest, CC, Cmp); + X86::CondCode CCode1 = + (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); + CCode1 = X86::GetOppositeBranchCondition(CCode1); + CC = DAG.getTargetConstant(CCode1, dl, MVT::i8); Cond = Cmp; addTest = false; } @@ -21672,7 +22103,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { X86::CondCode CCode = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); - CC = DAG.getConstant(CCode, dl, MVT::i8); + CC = DAG.getTargetConstant(CCode, dl, MVT::i8); Cond = Cond.getOperand(0).getOperand(1); addTest = false; } else if (Cond.getOpcode() == ISD::SETCC && @@ -21698,10 +22129,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); - CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); + CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); - CC = DAG.getConstant(X86::COND_P, dl, MVT::i8); + CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); Cond = Cmp; addTest = false; } @@ -21714,10 +22145,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); - CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); + CC = 
DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); - CC = DAG.getConstant(X86::COND_P, dl, MVT::i8); + CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); Cond = Cmp; addTest = false; } @@ -21742,7 +22173,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (addTest) { X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; - CC = DAG.getConstant(X86Cond, dl, MVT::i8); + CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); Cond = EmitCmp(Cond, DAG.getConstant(0, dl, Cond.getValueType()), X86Cond, dl, DAG); } @@ -21770,7 +22201,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); - unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + unsigned Align = Op.getConstantOperandVal(2); EVT VT = Node->getValueType(0); // Chain the dynamic stack allocation so that it doesn't modify the stack @@ -21811,7 +22242,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, } const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); - unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); + Register Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); @@ -21821,7 +22252,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - unsigned SPReg = RegInfo->getStackRegister(); + Register SPReg = RegInfo->getStackRegister(); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); @@ -22076,7 +22507,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, } return DAG.getNode(Opc, dl, VT, SrcOp, - DAG.getConstant(ShiftAmt, dl, MVT::i8)); + DAG.getTargetConstant(ShiftAmt, dl, MVT::i8)); } /// Handle vector element shifts where the shift amount may or may not be a @@ -22121,7 +22552,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt), MVT::v2i64, ShAmt); else { - SDValue ByteShift = DAG.getConstant( + SDValue ByteShift = DAG.getTargetConstant( (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8); ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt); ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt, @@ -22308,13 +22739,21 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // Helper to detect if the operand is CUR_DIRECTION rounding mode. auto isRoundModeCurDirection = [](SDValue Rnd) { if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) - return C->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION; + return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION; return false; }; auto isRoundModeSAE = [](SDValue Rnd) { - if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) - return C->getZExtValue() == X86::STATIC_ROUNDING::NO_EXC; + if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) { + unsigned RC = C->getZExtValue(); + if (RC & X86::STATIC_ROUNDING::NO_EXC) { + // Clear the NO_EXC bit and check remaining bits. + RC ^= X86::STATIC_ROUNDING::NO_EXC; + // As a convenience we allow no other bits or explicitly + // current direction. 
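Concretely, assuming the usual X86::STATIC_ROUNDING encoding (CUR_DIRECTION == 4, NO_EXC == 8):
// Rnd == 8  (NO_EXC)                  -> accepted: SAE, no explicit rounding
// Rnd == 12 (NO_EXC | CUR_DIRECTION)  -> accepted: SAE, current direction
// Rnd == 9  (NO_EXC | TO_NEG_INF)     -> rejected here and handled as an
//                                        embedded rounding mode instead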
+ return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION; + } + } return false; }; @@ -22335,7 +22774,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, }; SDLoc dl(Op); - unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned IntNo = Op.getConstantOperandVal(0); MVT VT = Op.getSimpleValueType(); const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); if (IntrData) { @@ -22411,9 +22850,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); - if (IntrData->Type == INTR_TYPE_3OP_IMM8) - Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3); - // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. @@ -22666,7 +23102,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case CMP_MASK_CC: { MVT MaskVT = Op.getSimpleValueType(); SDValue CC = Op.getOperand(3); - CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. @@ -22685,7 +23120,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case CMP_MASK_SCALAR_CC: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); - SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3)); + SDValue CC = Op.getOperand(3); SDValue Mask = Op.getOperand(4); SDValue Cmp; @@ -22750,16 +23185,16 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case COMI_RM: { // Comparison intrinsics with Sae SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); - unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); + unsigned CondVal = Op.getConstantOperandVal(3); SDValue Sae = Op.getOperand(4); SDValue FCmp; if (isRoundModeCurDirection(Sae)) FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, - DAG.getConstant(CondVal, dl, MVT::i8)); + DAG.getTargetConstant(CondVal, dl, MVT::i8)); else if (isRoundModeSAE(Sae)) FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS, - DAG.getConstant(CondVal, dl, MVT::i8), Sae); + DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae); else return SDValue(); // Need to fill with zeros to ensure the bitcast will produce zeroes @@ -22819,9 +23254,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode"); // Clear the upper bits of the rounding immediate so that the legacy // intrinsic can't trigger the scaling behavior of VRNDSCALE. - SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32, - Op.getOperand(2), - DAG.getConstant(0xf, dl, MVT::i32)); + auto Round = cast<ConstantSDNode>(Op.getOperand(2)); + SDValue RoundingMode = + DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), RoundingMode); } @@ -22829,12 +23264,22 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode"); // Clear the upper bits of the rounding immediate so that the legacy // intrinsic can't trigger the scaling behavior of VRNDSCALE. 
- SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32, - Op.getOperand(3), - DAG.getConstant(0xf, dl, MVT::i32)); + auto Round = cast<ConstantSDNode>(Op.getOperand(3)); + SDValue RoundingMode = + DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), RoundingMode); } + case BEXTRI: { + assert(IntrData->Opc0 == X86ISD::BEXTR && "Unexpected opcode"); + + // The control is a TargetConstant, but we need to convert it to a + // ConstantSDNode. + uint64_t Imm = Op.getConstantOperandVal(2); + SDValue Control = DAG.getConstant(Imm, dl, Op.getValueType()); + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), + Op.getOperand(1), Control); + } // ADC/ADCX/SBB case ADX: { SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32); @@ -23165,6 +23610,61 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, MaskVT, Operation); return DAG.getMergeValues({Result0, Result1}, DL); } + case Intrinsic::x86_mmx_pslli_w: + case Intrinsic::x86_mmx_pslli_d: + case Intrinsic::x86_mmx_pslli_q: + case Intrinsic::x86_mmx_psrli_w: + case Intrinsic::x86_mmx_psrli_d: + case Intrinsic::x86_mmx_psrli_q: + case Intrinsic::x86_mmx_psrai_w: + case Intrinsic::x86_mmx_psrai_d: { + SDLoc DL(Op); + SDValue ShAmt = Op.getOperand(2); + // If the argument is a constant, convert it to a target constant. + if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) { + ShAmt = DAG.getTargetConstant(C->getZExtValue(), DL, MVT::i32); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), + Op.getOperand(0), Op.getOperand(1), ShAmt); + } + + unsigned NewIntrinsic; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_mmx_pslli_w: + NewIntrinsic = Intrinsic::x86_mmx_psll_w; + break; + case Intrinsic::x86_mmx_pslli_d: + NewIntrinsic = Intrinsic::x86_mmx_psll_d; + break; + case Intrinsic::x86_mmx_pslli_q: + NewIntrinsic = Intrinsic::x86_mmx_psll_q; + break; + case Intrinsic::x86_mmx_psrli_w: + NewIntrinsic = Intrinsic::x86_mmx_psrl_w; + break; + case Intrinsic::x86_mmx_psrli_d: + NewIntrinsic = Intrinsic::x86_mmx_psrl_d; + break; + case Intrinsic::x86_mmx_psrli_q: + NewIntrinsic = Intrinsic::x86_mmx_psrl_q; + break; + case Intrinsic::x86_mmx_psrai_w: + NewIntrinsic = Intrinsic::x86_mmx_psra_w; + break; + case Intrinsic::x86_mmx_psrai_d: + NewIntrinsic = Intrinsic::x86_mmx_psra_d; + break; + } + + // The vector shift intrinsics with scalars use 32b shift amounts but + // the sse2/mmx shift instructions read 64 bits. Copy the 32 bits to an + // MMX register. + ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), + DAG.getConstant(NewIntrinsic, DL, MVT::i32), + Op.getOperand(1), ShAmt); + + } } } @@ -23177,7 +23677,9 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, // Scale must be constant.
if (!C) return SDValue(); - SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, + TLI.getPointerTy(DAG.getDataLayout())); EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger(); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); // If source is undef or we know it won't be used, use a zero vector @@ -23204,7 +23706,9 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, // Scale must be constant. if (!C) return SDValue(); - SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, + TLI.getPointerTy(DAG.getDataLayout())); unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(), VT.getVectorNumElements()); MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts); @@ -23238,7 +23742,9 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, // Scale must be constant. if (!C) return SDValue(); - SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, + TLI.getPointerTy(DAG.getDataLayout())); unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(), Src.getSimpleValueType().getVectorNumElements()); MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts); @@ -23266,7 +23772,9 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, // Scale must be constant. if (!C) return SDValue(); - SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, + TLI.getPointerTy(DAG.getDataLayout())); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = @@ -23435,8 +23943,7 @@ EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - + unsigned IntNo = Op.getConstantOperandVal(1); const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo); if (!IntrData) { switch (IntNo) { @@ -23538,10 +24045,10 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. // Otherwise return the value from Rand, which is always 0, casted to i32. - SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), - DAG.getConstant(1, dl, Op->getValueType(1)), - DAG.getConstant(X86::COND_B, dl, MVT::i8), - SDValue(Result.getNode(), 1) }; + SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), + DAG.getConstant(1, dl, Op->getValueType(1)), + DAG.getTargetConstant(X86::COND_B, dl, MVT::i8), + SDValue(Result.getNode(), 1)}; SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops); // Return { result, isValid, chain }. 
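The CMOV assembled above implements the documented RDRAND/RDSEED protocol: CF == 1 means the destination holds a valid random value, otherwise the result must be discarded. The same contract at the source level, via the standard intrinsic (illustrative; compile with -mrdrnd):

#include <immintrin.h>

// Returns 1 and fills *Out on success (hardware set CF), 0 on failure.
int TryRdrand32(unsigned *Out) {
  return _rdrand32_step(Out);
}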
@@ -23581,8 +24088,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, Scale, Chain, Subtarget); } case PREFETCH: { - SDValue Hint = Op.getOperand(6); - unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue(); + const APInt &HintVal = Op.getConstantOperandAPInt(6); assert((HintVal == 2 || HintVal == 3) && "Wrong prefetch hint in intrinsic: should be 2 or 3"); unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0); @@ -23678,7 +24184,7 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, if (verifyReturnAddressArgumentIsConstant(Op, DAG)) return SDValue(); - unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); SDLoc dl(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -23730,7 +24236,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); SDLoc dl(Op); // FIXME probably not meaningful - unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && "Invalid Frame Register!"); @@ -23743,12 +24249,11 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { +Register X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const { const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); - const MachineFunction &MF = DAG.getMachineFunction(); - unsigned Reg = StringSwitch<unsigned>(RegName) + Register Reg = StringSwitch<unsigned>(RegName) .Case("esp", X86::ESP) .Case("rsp", X86::RSP) .Case("ebp", X86::EBP) @@ -23762,8 +24267,7 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, #ifndef NDEBUG else { const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - unsigned FrameReg = - RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); + Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF); assert((FrameReg == X86::EBP || FrameReg == X86::RBP) && "Invalid Frame Register!"); } @@ -23809,7 +24313,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); + Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && "Invalid Frame Register!"); @@ -23967,6 +24471,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: case CallingConv::Fast: + case CallingConv::Tail: // Pass 'nest' parameter in EAX. // Must be kept in sync with X86CallingConv.td NestReg = X86::EAX; @@ -24279,12 +24784,9 @@ static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget, if (Opc == ISD::CTLZ) { // If src is zero (i.e. bsr sets ZF), returns NumBits. 
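The arithmetic behind this expansion, for NumBits == 32 (the final XOR with NumBits - 1 is outside this hunk and assumed from the surrounding function):
// bsr(0x00010000) == 16, and 16 ^ 31 == 15 == ctlz(0x00010000).
// For input 0, BSR sets ZF and leaves its result undefined, so the CMOV
// substitutes 2*32 - 1 == 63, and 63 ^ 31 == 32 == ctlz(0).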
- SDValue Ops[] = { - Op, - DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), - DAG.getConstant(X86::COND_E, dl, MVT::i8), - Op.getValue(1) - }; + SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), + DAG.getTargetConstant(X86::COND_E, dl, MVT::i8), + Op.getValue(1)}; Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); } @@ -24312,12 +24814,9 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0); // If src is zero (i.e. bsf sets ZF), returns NumBits. - SDValue Ops[] = { - Op, - DAG.getConstant(NumBits, dl, VT), - DAG.getConstant(X86::COND_E, dl, MVT::i8), - Op.getValue(1) - }; + SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT), + DAG.getTargetConstant(X86::COND_E, dl, MVT::i8), + Op.getValue(1)}; return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); } @@ -24453,7 +24952,7 @@ static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SDValue N0 = Op.getOperand(0); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), DAG.getConstant(0, DL, VT), N0); - SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8), + SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8), SDValue(Neg.getNode(), 1)}; return DAG.getNode(X86ISD::CMOV, DL, VT, Ops); } @@ -25033,7 +25532,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, // Optimize shl/srl/sra with constant shift amount. APInt APIntShiftAmt; - if (!isConstantSplat(Amt, APIntShiftAmt)) + if (!X86::isConstantSplat(Amt, APIntShiftAmt)) return SDValue(); // If the shift amount is out of range, return undef. @@ -25220,7 +25719,7 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, } ConstantSDNode *ND = cast<ConstantSDNode>(Op); - APInt C(SVTBits, ND->getAPIntValue().getZExtValue()); + APInt C(SVTBits, ND->getZExtValue()); uint64_t ShAmt = C.getZExtValue(); if (ShAmt >= SVTBits) { Elts.push_back(DAG.getUNDEF(SVT)); @@ -25502,7 +26001,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, (VT == MVT::v32i8 && Subtarget.hasInt256())) && !Subtarget.hasXOP()) { int NumElts = VT.getVectorNumElements(); - SDValue Cst8 = DAG.getConstant(8, dl, MVT::i8); + SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8); // Extend constant shift amount to vXi16 (it doesn't matter if the type // isn't legal). @@ -25774,7 +26273,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI); uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits); return DAG.getNode(Op, DL, VT, R, - DAG.getConstant(RotateAmt, DL, MVT::i8)); + DAG.getTargetConstant(RotateAmt, DL, MVT::i8)); } // Else, fall-back on VPROLV/VPRORV. @@ -25795,7 +26294,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, if (0 <= CstSplatIndex) { uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits); return DAG.getNode(X86ISD::VROTLI, DL, VT, R, - DAG.getConstant(RotateAmt, DL, MVT::i8)); + DAG.getTargetConstant(RotateAmt, DL, MVT::i8)); } // Use general rotate by variable (per-element). @@ -26032,7 +26531,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // If this is a canonical idempotent atomicrmw w/no uses, we have a better // lowering available in lowerAtomicArith. - // TODO: push more cases through this path. + // TODO: push more cases through this path. 
if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand())) if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() && AI->use_empty()) @@ -26087,10 +26586,22 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { return Loaded; } +bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const { + if (!SI.isUnordered()) + return false; + return ExperimentalUnorderedISEL; +} +bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const { + if (!LI.isUnordered()) + return false; + return ExperimentalUnorderedISEL; +} + + /// Emit a locked operation on a stack location which does not change any /// memory location, but does involve a lock prefix. Location is chosen to be /// a) very likely accessed only by a single thread to minimize cache traffic, -/// and b) definitely dereferenceable. Returns the new Chain result. +/// and b) definitely dereferenceable. Returns the new Chain result. static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, SDLoc DL) { @@ -26099,22 +26610,22 @@ static SDValue emitLockedStackOp(SelectionDAG &DAG, // operations issued by the current processor. As such, the location // referenced is not relevant for the ordering properties of the instruction. // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual, - // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions + // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions // 2) Using an immediate operand appears to be the best encoding choice // here since it doesn't require an extra register. // 3) OR appears to be very slightly faster than ADD. (Though, the difference // is small enough it might just be measurement noise.) // 4) When choosing offsets, there are several contributing factors: // a) If there's no redzone, we default to TOS. (We could allocate a cache - // line aligned stack object to improve this case.) + // line aligned stack object to improve this case.) // b) To minimize our chances of introducing a false dependence, we prefer - // to offset the stack usage from TOS slightly. + // to offset the stack usage from TOS slightly. // c) To minimize concerns about cross thread stack usage - in particular, // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which // captures state in the TOS frame and accesses it from many threads - // we want to use an offset such that the offset is in a distinct cache // line from the TOS frame. - // + // // For a general discussion of the tradeoffs and benchmark results, see: // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/ @@ -26155,10 +26666,10 @@ static SDValue emitLockedStackOp(SelectionDAG &DAG, static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); - AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( - cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); - SyncScope::ID FenceSSID = static_cast<SyncScope::ID>( - cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); + AtomicOrdering FenceOrdering = + static_cast<AtomicOrdering>(Op.getConstantOperandVal(1)); + SyncScope::ID FenceSSID = + static_cast<SyncScope::ID>(Op.getConstantOperandVal(2)); // The only fence that needs an instruction is a sequentially-consistent // cross-thread fence.
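At the source level this path corresponds to a seq_cst fence; a minimal reproduction (illustrative), which on x86-64 becomes either mfence or, under the locked-stack-op strategy described above, something like lock orl $0, -64(%rsp):

#include <atomic>

// Full two-way barrier: orders all earlier loads/stores before later ones.
void FullFence() {
  std::atomic_thread_fence(std::memory_order_seq_cst);
}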
@@ -26167,7 +26678,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, if (Subtarget.hasMFence()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); - SDValue Chain = Op.getOperand(0); + SDValue Chain = Op.getOperand(0); return emitLockedStackOp(DAG, Subtarget, Chain, dl); } @@ -26218,6 +26729,17 @@ static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT InVT = V.getSimpleValueType(); + if (InVT == MVT::v64i8) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(V, DL); + Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget); + Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget); + Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo); + Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi); + Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, + DAG.getConstant(32, DL, MVT::i8)); + return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi); + } if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(V, DL); @@ -26258,8 +26780,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SDLoc dl(Op); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl); - EVT CastVT = MVT::getVectorVT(DstVT.getVectorElementType(), - DstVT.getVectorNumElements() / 2); + MVT CastVT = DstVT.getHalfNumVectorElementsVT(); Lo = DAG.getBitcast(CastVT, Lo); Hi = DAG.getBitcast(CastVT, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi); @@ -26275,53 +26796,37 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, return DAG.getZExtOrTrunc(V, DL, DstVT); } - if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || - SrcVT == MVT::i64) { - assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); - if (DstVT != MVT::f64 && DstVT != MVT::i64 && - !(DstVT == MVT::x86mmx && SrcVT.isVector())) - // This conversion needs to be expanded. - return SDValue(); + assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || + SrcVT == MVT::i64) && "Unexpected VT!"); - SDLoc dl(Op); - if (SrcVT.isVector()) { - // Widen the vector in input in the case of MVT::v2i32. - // Example: from MVT::v2i32 to MVT::v4i32. - MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(), - SrcVT.getVectorNumElements() * 2); - Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src, - DAG.getUNDEF(SrcVT)); - } else { - assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() && - "Unexpected source type in LowerBITCAST"); - Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src); - } + assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); + if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) && + !(DstVT == MVT::x86mmx && SrcVT.isVector())) + // This conversion needs to be expanded. + return SDValue(); - MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64; - Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src); + SDLoc dl(Op); + if (SrcVT.isVector()) { + // Widen the vector in input in the case of MVT::v2i32. + // Example: from MVT::v2i32 to MVT::v4i32. + MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(), + SrcVT.getVectorNumElements() * 2); + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src, + DAG.getUNDEF(SrcVT)); + } else { + assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() && + "Unexpected source type in LowerBITCAST"); + Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src); + } - if (DstVT == MVT::x86mmx) - return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src); + MVT V2X64VT = DstVT == MVT::f64 ? 
MVT::v2f64 : MVT::v2i64; + Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src, - DAG.getIntPtrConstant(0, dl)); - } + if (DstVT == MVT::x86mmx) + return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src); - assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() && - Subtarget.hasMMX() && "Unexpected custom BITCAST"); - assert((DstVT == MVT::i64 || - (DstVT.isVector() && DstVT.getSizeInBits()==64)) && - "Unexpected custom BITCAST"); - // i64 <=> MMX conversions are Legal. - if (SrcVT==MVT::i64 && DstVT.isVector()) - return Op; - if (DstVT==MVT::i64 && SrcVT.isVector()) - return Op; - // MMX <=> MMX conversions are Legal. - if (SrcVT.isVector() && DstVT.isVector()) - return Op; - // All other conversions need to be expanded. - return SDValue(); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src, + DAG.getIntPtrConstant(0, dl)); } /// Compute the horizontal sum of bytes in V for the elements of VT. @@ -26549,6 +27054,13 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SDValue In = Op.getOperand(0); SDLoc DL(Op); + // Split v8i64/v16i32 without BWI so that we can still use the PSHUFB + // lowering. + if (VT == MVT::v8i64 || VT == MVT::v16i32) { + assert(!Subtarget.hasBWI() && "BWI should Expand BITREVERSE"); + return Lower512IntUnary(Op, DAG); + } + unsigned NumElts = VT.getVectorNumElements(); assert(VT.getScalarType() == MVT::i8 && "Only byte vector BITREVERSE supported"); @@ -26656,12 +27168,12 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, // seq_cst which isn't SingleThread, everything just needs to be preserved // during codegen and then dropped. Note that we expect (but don't assume), // that orderings other than seq_cst and acq_rel have been canonicalized to - // a store or load. + // a store or load. if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent && AN->getSyncScopeID() == SyncScope::System) { // Prefer a locked operation against a stack location to minimize cache // traffic. This assumes that stack locations are very likely to be - // accessed only by the owning thread. + // accessed only by the owning thread. SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL); assert(!N->hasAnyUseOfValue(0)); // NOTE: The getUNDEF is needed to give something for the unused result 0. @@ -26886,12 +27398,13 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SDValue Chain = N->getChain(); SDValue BasePtr = N->getBasePtr(); - if (VT == MVT::v2f32) { + if (VT == MVT::v2f32 || VT == MVT::v2i32) { assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); // If the index is v2i64 and we have VLX we can use xmm for data and index. 
if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) { - Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, - DAG.getUNDEF(MVT::v2f32)); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT)); SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( @@ -26901,30 +27414,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, return SDValue(); } - if (VT == MVT::v2i32) { - assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); - Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, - DAG.getUNDEF(MVT::v2i32)); - // If the index is v2i64 and we have VLX we can use xmm for data and index. - if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) { - SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other); - SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; - SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( - VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); - return SDValue(NewScatter.getNode(), 1); - } - // Custom widen all the operands to avoid promotion. - EVT NewIndexVT = EVT::getVectorVT( - *DAG.getContext(), Index.getValueType().getVectorElementType(), 4); - Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, - DAG.getUNDEF(Index.getValueType())); - Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, - DAG.getConstant(0, dl, MVT::v2i1)); - SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; - return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl, - Ops, N->getMemOperand()); - } - MVT IndexVT = Index.getSimpleValueType(); MVT MaskVT = Mask.getSimpleValueType(); @@ -27160,6 +27649,13 @@ SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op, return NOOP; } +SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, + RTLIB::Libcall Call) const { + SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first; +} + /// Provide custom lowering hooks for some operations. 
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -27206,10 +27702,14 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); + case ISD::STRICT_FP_ROUND: return LowerSTRICT_FP_ROUND(Op, DAG); case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG); case ISD::STORE: return LowerStore(Op, Subtarget, DAG); case ISD::FADD: - case ISD::FSUB: return lowerFaddFsub(Op, DAG, Subtarget); + case ISD::FSUB: return lowerFaddFsub(Op, DAG); + case ISD::FMUL: return LowerF128Call(Op, DAG, RTLIB::MUL_F128); + case ISD::FDIV: return LowerF128Call(Op, DAG, RTLIB::DIV_F128); case ISD::FABS: case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); @@ -27347,37 +27847,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::MUL: { EVT VT = N->getValueType(0); - assert(VT.isVector() && "Unexpected VT"); - if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger && - VT.getVectorNumElements() == 2) { - // Promote to a pattern that will be turned into PMULUDQ. - SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64, - N->getOperand(0)); - SDValue N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64, - N->getOperand(1)); - SDValue Mul = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, N0, N1); - Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul)); - } else if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && - VT.getVectorElementType() == MVT::i8) { - // Pre-promote these to vXi16 to avoid op legalization thinking all 16 - // elements are needed. - MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); - SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0)); - SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1)); - SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1); - Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); - unsigned NumConcats = 16 / VT.getVectorNumElements(); - SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT)); - ConcatOps[0] = Res; - Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps); - Results.push_back(Res); - } + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + VT.getVectorElementType() == MVT::i8 && "Unexpected VT!"); + // Pre-promote these to vXi16 to avoid op legalization thinking all 16 + // elements are needed. 
+ MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); + SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0)); + SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1)); + SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1); + Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + unsigned NumConcats = 16 / VT.getVectorNumElements(); + SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT)); + ConcatOps[0] = Res; + Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps); + Results.push_back(Res); return; } - case ISD::UADDSAT: - case ISD::SADDSAT: - case ISD::USUBSAT: - case ISD::SSUBSAT: case X86ISD::VPMADDWD: case X86ISD::AVG: { // Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and @@ -27388,6 +27873,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT InVT = N->getOperand(0).getValueType(); assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 && "Expected a VT that divides into 128 bits."); + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + "Unexpected type action!"); unsigned NumConcat = 128 / InVT.getSizeInBits(); EVT InWideVT = EVT::getVectorVT(*DAG.getContext(), @@ -27404,9 +27891,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops); SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1); - if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, - DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); return; } @@ -27435,26 +27919,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Hi); return; } - case ISD::SETCC: { - // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when - // setCC result type is v2i1 because type legalzation will end up with - // a v4i1 setcc plus an extend. - assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type"); - if (N->getOperand(0).getValueType() != MVT::v2f32 || - getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector) - return; - SDValue UNDEF = DAG.getUNDEF(MVT::v2f32); - SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, - N->getOperand(0), UNDEF); - SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, - N->getOperand(1), UNDEF); - SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS, - N->getOperand(2)); - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); - return; - } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. case X86ISD::FMINC: case X86ISD::FMIN: @@ -27475,7 +27939,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::SREM: case ISD::UREM: { EVT VT = N->getValueType(0); - if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector) { + if (VT.isVector()) { + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + "Unexpected type action!"); // If this RHS is a constant splat vector we can widen this and let // division/remainder by constant optimize it. // TODO: Can we do something for non-splat? @@ -27493,17 +27959,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - if (VT == MVT::v2i32) { - // Legalize v2i32 div/rem by unrolling. Otherwise we promote to the - // v2i64 and unroll later. But then we create i64 scalar ops which - // might be slow in 64-bit mode or require a libcall in 32-bit mode. 
- Results.push_back(DAG.UnrollVectorOp(N)); - return; - } - - if (VT.isVector()) - return; - LLVM_FALLTHROUGH; } case ISD::SDIVREM: @@ -27561,58 +28016,40 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } } - return; - } - case ISD::SIGN_EXTEND_VECTOR_INREG: { - if (ExperimentalVectorWideningLegalization) - return; - - EVT VT = N->getValueType(0); - SDValue In = N->getOperand(0); - EVT InVT = In.getValueType(); - if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && - (InVT == MVT::v16i16 || InVT == MVT::v32i8)) { - // Custom split this so we can extend i8/i16->i32 invec. This is better - // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using - // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting - // we allow the sra from the extend to i32 to be shared by the split. - EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(), - InVT.getVectorElementType(), - InVT.getVectorNumElements() / 2); - MVT ExtendVT = MVT::getVectorVT(MVT::i32, - VT.getVectorNumElements()); - In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExtractVT, - In, DAG.getIntPtrConstant(0, dl)); - In = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, MVT::v4i32, In); - - // Fill a vector with sign bits for each element. - SDValue Zero = DAG.getConstant(0, dl, ExtendVT); - SDValue SignBits = DAG.getSetCC(dl, ExtendVT, Zero, In, ISD::SETGT); - - EVT LoVT, HiVT; - std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); - - // Create an unpackl and unpackh to interleave the sign bits then bitcast - // to vXi64. - SDValue Lo = getUnpackl(DAG, dl, ExtendVT, In, SignBits); - Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo); - SDValue Hi = getUnpackh(DAG, dl, ExtendVT, In, SignBits); - Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi); + if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 && + getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector && + isTypeLegal(MVT::v4i64)) { + // Input needs to be split and output needs to widened. Let's use two + // VTRUNCs, and shuffle their results together into the wider type. + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(In, dl); - SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo); + Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi); + SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi, + { 0, 1, 2, 3, 16, 17, 18, 19, + -1, -1, -1, -1, -1, -1, -1, -1 }); Results.push_back(Res); return; } + return; } + case ISD::ANY_EXTEND: + // Right now, only MVT::v8i8 has Custom action for an illegal type. + // It's intended to custom handle the input type. + assert(N->getValueType(0) == MVT::v8i8 && + "Do not know how to legalize this Node"); + return; case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: { EVT VT = N->getValueType(0); SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && - (InVT == MVT::v4i16 || InVT == MVT::v4i8) && - getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector) { + (InVT == MVT::v4i16 || InVT == MVT::v4i8)){ + assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector && + "Unexpected type action!"); assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode"); // Custom split this so we can extend i8/i16->i32 invec. 
This is better // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using @@ -27683,27 +28120,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Src = N->getOperand(0); EVT SrcVT = Src.getValueType(); - // Promote these manually to avoid over promotion to v2i64. Type - // legalization will revisit the v2i32 operation for more cleanup. - if ((VT == MVT::v2i8 || VT == MVT::v2i16) && - getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) { - // AVX512DQ provides instructions that produce a v2i64 result. - if (Subtarget.hasDQI()) - return; - - SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v2i32, Src); - Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext - : ISD::AssertSext, - dl, MVT::v2i32, Res, - DAG.getValueType(VT.getVectorElementType())); - Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); - Results.push_back(Res); - return; - } - if (VT.isVector() && VT.getScalarSizeInBits() < 32) { - if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) - return; + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + "Unexpected type action!"); // Try to create a 128 bit vector, but don't exceed a 32 bit element. unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U); @@ -27738,35 +28157,18 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, assert((IsSigned || Subtarget.hasAVX512()) && "Can only handle signed conversion without AVX512"); assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); - bool Widenv2i32 = - getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector; + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + "Unexpected type action!"); if (Src.getValueType() == MVT::v2f64) { - unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; if (!IsSigned && !Subtarget.hasVLX()) { - // If v2i32 is widened, we can defer to the generic legalizer. - if (Widenv2i32) - return; - // Custom widen by doubling to a legal vector with. Isel will - // further widen to v8f64. - Opc = ISD::FP_TO_UINT; - Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, - Src, DAG.getUNDEF(MVT::v2f64)); + // If we have VLX we can emit a target specific FP_TO_UINT node, + // otherwise we can defer to the generic legalizer which will widen + // the input as well. This will be further widened during op + // legalization to v8i32<-v8f64. + return; } + unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src); - if (!Widenv2i32) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); - return; - } - if (SrcVT == MVT::v2f32 && - getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) { - SDValue Idx = DAG.getIntPtrConstant(0, dl); - SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, - DAG.getUNDEF(MVT::v2f32)); - Res = DAG.getNode(IsSigned ? 
ISD::FP_TO_SINT - : ISD::FP_TO_UINT, dl, MVT::v4i32, Res); - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx); Results.push_back(Res); return; } @@ -27776,6 +28178,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } + assert(!VT.isVector() && "Vectors should have been handled above!"); + if (Subtarget.hasDQI() && VT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) { assert(!Subtarget.is64Bit() && "i64 should be legal"); @@ -27847,7 +28251,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::INTRINSIC_W_CHAIN: { - unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + unsigned IntNo = N->getConstantOperandVal(1); switch (IntNo) { default : llvm_unreachable("Do not know how to custom type " "legalize this intrinsic operation!"); @@ -27905,7 +28309,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); SDValue Result; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - unsigned BasePtr = TRI->getBaseRegister(); + Register BasePtr = TRI->getBaseRegister(); MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); if (TRI->hasBasePointer(DAG.getMachineFunction()) && (BasePtr == X86::RBX || BasePtr == X86::EBX)) { @@ -28060,34 +28464,33 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - if (SrcVT != MVT::f64 || - (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) || - getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) + if (DstVT.isVector() && SrcVT == MVT::x86mmx) { + assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector && + "Unexpected type action!"); + EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT); + SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, WideVT, N->getOperand(0)); + Results.push_back(Res); return; + } - unsigned NumElts = DstVT.getVectorNumElements(); - EVT SVT = DstVT.getVectorElementType(); - EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); - SDValue Res; - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0)); - Res = DAG.getBitcast(WiderVT, Res); - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); return; } case ISD::MGATHER: { EVT VT = N->getValueType(0); - if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { + if ((VT == MVT::v2f32 || VT == MVT::v2i32) && + (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { auto *Gather = cast<MaskedGatherSDNode>(N); SDValue Index = Gather->getIndex(); if (Index.getValueType() != MVT::v2i64) return; + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + "Unexpected type action!"); + EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT); SDValue Mask = Gather->getMask(); assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); - SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, + SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Gather->getPassThru(), - DAG.getUNDEF(MVT::v2f32)); + DAG.getUNDEF(VT)); if (!Subtarget.hasVLX()) { // We need to widen the mask, but the instruction will only use 2 // of its elements. So we can use undef. 
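The MGATHER hunk above leans on the widening idiom used throughout this patch: pad an illegal narrow vector out to whatever type the type legalizer would pick, leaving the new lanes undefined. A standalone sketch of the idiom, where NarrowVal and VT stand in for the operand being widened:

    // VT is an illegal narrow type such as MVT::v2f32; ask the legalizer
    // for its target type (v4f32 here) instead of hard-coding it.
    EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
    // Concatenate with UNDEF: the upper lanes exist only for legality and
    // carry no defined value, which is fine because the masked gather or
    // scatter reads just the low two elements.
    SDValue Wide = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
                               NarrowVal, DAG.getUNDEF(VT));

Querying getTypeToTransformTo rather than hard-coding v4f32/v4i32 is what lets the previously separate v2f32 and v2i32 paths collapse into one block in both the scatter and gather hunks.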
@@ -28098,66 +28501,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Ops[] = { Gather->getChain(), PassThru, Mask, Gather->getBasePtr(), Index, Gather->getScale() }; SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( - DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl, + DAG.getVTList(WideVT, Mask.getValueType(), MVT::Other), Ops, dl, Gather->getMemoryVT(), Gather->getMemOperand()); Results.push_back(Res); Results.push_back(Res.getValue(2)); return; } - if (VT == MVT::v2i32) { - auto *Gather = cast<MaskedGatherSDNode>(N); - SDValue Index = Gather->getIndex(); - SDValue Mask = Gather->getMask(); - assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); - SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, - Gather->getPassThru(), - DAG.getUNDEF(MVT::v2i32)); - // If the index is v2i64 we can use it directly. - if (Index.getValueType() == MVT::v2i64 && - (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { - if (!Subtarget.hasVLX()) { - // We need to widen the mask, but the instruction will only use 2 - // of its elements. So we can use undef. - Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, - DAG.getUNDEF(MVT::v2i1)); - Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); - } - SDValue Ops[] = { Gather->getChain(), PassThru, Mask, - Gather->getBasePtr(), Index, Gather->getScale() }; - SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( - DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl, - Gather->getMemoryVT(), Gather->getMemOperand()); - SDValue Chain = Res.getValue(2); - if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); - Results.push_back(Chain); - return; - } - if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) { - EVT IndexVT = Index.getValueType(); - EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(), - IndexVT.getScalarType(), 4); - // Otherwise we need to custom widen everything to avoid promotion. - Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, - DAG.getUNDEF(IndexVT)); - Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, - DAG.getConstant(0, dl, MVT::v2i1)); - SDValue Ops[] = { Gather->getChain(), PassThru, Mask, - Gather->getBasePtr(), Index, Gather->getScale() }; - SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other), - Gather->getMemoryVT(), dl, Ops, - Gather->getMemOperand()); - SDValue Chain = Res.getValue(1); - if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); - Results.push_back(Chain); - return; - } - } return; } case ISD::LOAD: { @@ -28166,8 +28515,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, // cast since type legalization will try to use an i64 load. 
MVT VT = N->getSimpleValueType(0); assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT"); - if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) - return; + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + "Unexpected type action!"); if (!ISD::isNON_EXTLoad(N)) return; auto *Ld = cast<LoadSDNode>(N); @@ -28177,11 +28526,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); SDValue Chain = Res.getValue(1); - MVT WideVT = MVT::getVectorVT(LdVT, 2); - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res); - MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), - VT.getVectorNumElements() * 2); - Res = DAG.getBitcast(CastVT, Res); + MVT VecVT = MVT::getVectorVT(LdVT, 2); + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res); + EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT); + Res = DAG.getBitcast(WideVT, Res); Results.push_back(Res); Results.push_back(Chain); return; @@ -28236,6 +28584,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; case X86ISD::Wrapper: return "X86ISD::Wrapper"; case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; + case X86ISD::MOVQ2DQ: return "X86ISD::MOVQ2DQ"; case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q"; case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W"; case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D"; @@ -28373,6 +28722,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; + case X86ISD::VBROADCAST_LOAD: return "X86ISD::VBROADCAST_LOAD"; case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; @@ -28737,6 +29087,9 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { } bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { + if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0))) + return false; + EVT SrcVT = ExtVal.getOperand(0).getValueType(); // There is no extending load for vXi1. @@ -28856,10 +29209,10 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - unsigned mainDstReg = MRI.createVirtualRegister(RC); - unsigned fallDstReg = MRI.createVirtualRegister(RC); + Register mainDstReg = MRI.createVirtualRegister(RC); + Register fallDstReg = MRI.createVirtualRegister(RC); // thisMBB: // xbegin fallMBB @@ -28913,7 +29266,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, static_assert(X86::AddrNumOperands == 5, "VAARG_64 assumes 5 address operands"); - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); MachineOperand &Base = MI.getOperand(1); MachineOperand &Scale = MI.getOperand(2); MachineOperand &Index = MI.getOperand(3); @@ -29049,7 +29402,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, assert(OffsetReg != 0); // Read the reg_save_area address. 
- unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); + Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) .add(Base) .add(Scale) @@ -29059,8 +29412,8 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .setMemRefs(LoadOnlyMMO); // Zero-extend the offset - unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); - BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) + Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) .addImm(0) .addReg(OffsetReg) .addImm(X86::sub_32bit); @@ -29071,7 +29424,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .addReg(RegSaveReg); // Compute the offset for the next argument - unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); + Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) .addReg(OffsetReg) .addImm(UseFPOffset ? 16 : 8); @@ -29096,7 +29449,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // // Load the overflow_area address into a register. - unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); + Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) .add(Base) .add(Scale) @@ -29110,7 +29463,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, if (NeedsAlign) { // Align the overflow address assert(isPowerOf2_32(Align) && "Alignment must be a power of 2"); - unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); + Register TmpReg = MRI.createVirtualRegister(AddrRegClass); // aligned_addr = (addr + (align-1)) & ~(align-1) BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) @@ -29127,7 +29480,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Compute the next overflow address after this argument. 
// (the overflow address should be kept 8-byte aligned) - unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); + Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) .addReg(OverflowDestReg) .addImm(ArgSizeA8); @@ -29191,7 +29544,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned CountReg = MI.getOperand(0).getReg(); + Register CountReg = MI.getOperand(0).getReg(); int64_t RegSaveFrameIndex = MI.getOperand(1).getImm(); int64_t VarArgsFPOffset = MI.getOperand(2).getImm(); @@ -29273,7 +29626,9 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, static bool isCMOVPseudo(MachineInstr &MI) { switch (MI.getOpcode()) { case X86::CMOV_FR32: + case X86::CMOV_FR32X: case X86::CMOV_FR64: + case X86::CMOV_FR64X: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: @@ -29326,9 +29681,9 @@ static MachineInstrBuilder createPHIsForCMOVsInSinkBB( MachineInstrBuilder MIB; for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { - unsigned DestReg = MIIt->getOperand(0).getReg(); - unsigned Op1Reg = MIIt->getOperand(1).getReg(); - unsigned Op2Reg = MIIt->getOperand(2).getReg(); + Register DestReg = MIIt->getOperand(0).getReg(); + Register Op1Reg = MIIt->getOperand(1).getReg(); + Register Op2Reg = MIIt->getOperand(2).getReg(); // If this CMOV we are generating is the opposite condition from // the jump we generated, then we have to swap the operands for the @@ -29486,9 +29841,9 @@ X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV, // SinkMBB: // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ] - unsigned DestReg = FirstCMOV.getOperand(0).getReg(); - unsigned Op1Reg = FirstCMOV.getOperand(1).getReg(); - unsigned Op2Reg = FirstCMOV.getOperand(2).getReg(); + Register DestReg = FirstCMOV.getOperand(0).getReg(); + Register Op1Reg = FirstCMOV.getOperand(1).getReg(); + Register Op2Reg = FirstCMOV.getOperand(2).getReg(); MachineInstrBuilder MIB = BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg) .addReg(Op1Reg) @@ -30006,7 +30361,7 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, // call the retpoline thunk. DebugLoc DL = MI.getDebugLoc(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); - unsigned CalleeVReg = MI.getOperand(0).getReg(); + Register CalleeVReg = MI.getOperand(0).getReg(); unsigned Opc = getOpcodeForRetpoline(MI.getOpcode()); // Find an available scratch register to hold the callee. On 64-bit, we can @@ -30079,7 +30434,7 @@ void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI, // Initialize a register with zero. MVT PVT = getPointerTy(MF->getDataLayout()); const TargetRegisterClass *PtrRC = getRegClassFor(PVT); - unsigned ZReg = MRI.createVirtualRegister(PtrRC); + Register ZReg = MRI.createVirtualRegister(PtrRC); unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr; BuildMI(*MBB, MI, DL, TII->get(XorRROpc)) .addDef(ZReg) @@ -30087,7 +30442,7 @@ void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI, .addReg(ZReg, RegState::Undef); // Read the current SSP Register value to the zeroed register. - unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC); + Register SSPCopyReg = MRI.createVirtualRegister(PtrRC); unsigned RdsspOpc = (PVT == MVT::i64) ? 
X86::RDSSPQ : X86::RDSSPD; BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg); @@ -30131,8 +30486,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, const TargetRegisterClass *RC = MRI.getRegClass(DstReg); assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); (void)TRI; - unsigned mainDstReg = MRI.createVirtualRegister(RC); - unsigned restoreDstReg = MRI.createVirtualRegister(RC); + Register mainDstReg = MRI.createVirtualRegister(RC); + Register restoreDstReg = MRI.createVirtualRegister(RC); MemOpndSlot = CurOp; @@ -30246,8 +30601,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); X86FI->setRestoreBasePointer(MF); - unsigned FramePtr = RegInfo->getFrameRegister(*MF); - unsigned BasePtr = RegInfo->getBaseRegister(); + Register FramePtr = RegInfo->getFrameRegister(*MF); + Register BasePtr = RegInfo->getBaseRegister(); unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm; addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), FramePtr, true, X86FI->getRestoreBasePointerOffset()) @@ -30329,7 +30684,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, MBB->addSuccessor(checkSspMBB); // Initialize a register with zero. - unsigned ZReg = MRI.createVirtualRegister(PtrRC); + Register ZReg = MRI.createVirtualRegister(PtrRC); unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr; BuildMI(checkSspMBB, DL, TII->get(XorRROpc)) .addDef(ZReg) @@ -30337,7 +30692,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, .addReg(ZReg, RegState::Undef); // Read the current SSP Register value to the zeroed register. - unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC); + Register SSPCopyReg = MRI.createVirtualRegister(PtrRC); unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD; BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg); @@ -30352,7 +30707,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, checkSspMBB->addSuccessor(fallMBB); // Reload the previously saved SSP register value. - unsigned PrevSSPReg = MRI.createVirtualRegister(PtrRC); + Register PrevSSPReg = MRI.createVirtualRegister(PtrRC); unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; const int64_t SPPOffset = 3 * PVT.getStoreSize(); MachineInstrBuilder MIB = @@ -30370,7 +30725,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, MIB.setMemRefs(MMOs); // Subtract the current SSP from the previous SSP. - unsigned SspSubReg = MRI.createVirtualRegister(PtrRC); + Register SspSubReg = MRI.createVirtualRegister(PtrRC); unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr; BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg) .addReg(PrevSSPReg) @@ -30384,7 +30739,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8. unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri; unsigned Offset = (PVT == MVT::i64) ? 3 : 2; - unsigned SspFirstShrReg = MRI.createVirtualRegister(PtrRC); + Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg) .addReg(SspSubReg) .addImm(Offset); @@ -30394,7 +30749,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg); // Reset the lower 8 bits. 
- unsigned SspSecondShrReg = MRI.createVirtualRegister(PtrRC); + Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg) .addReg(SspFirstShrReg) .addImm(8); @@ -30406,12 +30761,12 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, // Do a single shift left. unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1; - unsigned SspAfterShlReg = MRI.createVirtualRegister(PtrRC); + Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg) .addReg(SspSecondShrReg); // Save the value 128 to a register (will be used next with incssp). - unsigned Value128InReg = MRI.createVirtualRegister(PtrRC); + Register Value128InReg = MRI.createVirtualRegister(PtrRC); unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri; BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg) .addImm(128); @@ -30419,8 +30774,8 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, // Since incssp only looks at the lower 8 bits, we might need to do several // iterations of incssp until we finish fixing the shadow stack. - unsigned DecReg = MRI.createVirtualRegister(PtrRC); - unsigned CounterReg = MRI.createVirtualRegister(PtrRC); + Register DecReg = MRI.createVirtualRegister(PtrRC); + Register CounterReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg) .addReg(SspAfterShlReg) .addMBB(fixShadowLoopPrepareMBB) @@ -30460,11 +30815,11 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, const TargetRegisterClass *RC = (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; - unsigned Tmp = MRI.createVirtualRegister(RC); + Register Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; - unsigned SP = RegInfo->getStackRegister(); + Register SP = RegInfo->getStackRegister(); MachineInstrBuilder MIB; @@ -30662,8 +31017,8 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>(); MFI->setRestoreBasePointer(MF); - unsigned FP = RI.getFrameRegister(*MF); - unsigned BP = RI.getBaseRegister(); + Register FP = RI.getFrameRegister(*MF); + Register BP = RI.getBaseRegister(); unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm; addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true, MFI->getRestoreBasePointerOffset()) @@ -30674,7 +31029,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, } // IReg is used as an index in a memory operand and therefore can't be SP - unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass); + Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass); addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI, Subtarget.is64Bit() ? 
8 : 4); BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri)) @@ -30683,8 +31038,8 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE); if (Subtarget.is64Bit()) { - unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass); - unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); + Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass); + Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); // leaq .LJTI0_0(%rip), BReg BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg) @@ -30710,9 +31065,9 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addReg(0); break; case MachineJumpTableInfo::EK_LabelDifference32: { - unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass); - unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass); - unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass); + Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass); + Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass); + Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass); // movl (BReg,IReg64,4), OReg BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg) @@ -30783,8 +31138,8 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, DefRegs[MOp.getReg()] = true; MachineInstrBuilder MIB(*MF, &II); - for (unsigned RI = 0; SavedRegs[RI]; ++RI) { - unsigned Reg = SavedRegs[RI]; + for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) { + unsigned Reg = SavedRegs[RegIdx]; if (!DefRegs[Reg]) MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); } @@ -30906,20 +31261,18 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, TII->get(X86::FNSTCW16m)), OrigCWFrameIdx); // Load the old value of the control word... - unsigned OldCW = - MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); + Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW), OrigCWFrameIdx); // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero. - unsigned NewCW = - MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); + Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW) .addReg(OldCW, RegState::Kill).addImm(0xC00); // Extract to 16 bits. - unsigned NewCW16 = - MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); + Register NewCW16 = + MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16) .addReg(NewCW, RegState::Kill, X86::sub_16bit); @@ -31023,7 +31376,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineRegisterInfo &MRI = MF->getRegInfo(); MVT SPTy = getPointerTy(MF->getDataLayout()); const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); - unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass); + Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass); X86AddressMode AM = getAddressFromInstr(&MI, 0); // Regalloc does not need any help when the memory operand of CMPXCHG8B @@ -31034,10 +31387,14 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its // four operand definitions that are E[ABCD] registers. We skip them and // then insert the LEA. 
- MachineBasicBlock::iterator MBBI(MI); - while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) || - MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX)) - --MBBI; + MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator()); + while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) || + RMBBI->definesRegister(X86::EBX) || + RMBBI->definesRegister(X86::ECX) || + RMBBI->definesRegister(X86::EDX))) { + ++RMBBI; + } + MachineBasicBlock::iterator MBBI(RMBBI); addFullAddress( BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM); @@ -31232,12 +31589,21 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known.One |= Known2.One; break; } + case X86ISD::PSADBW: { + assert(VT.getScalarType() == MVT::i64 && + Op.getOperand(0).getValueType().getScalarType() == MVT::i8 && + "Unexpected PSADBW types"); + + // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result. + Known.Zero.setBitsFrom(16); + break; + } case X86ISD::CMOV: { - Known = DAG.computeKnownBits(Op.getOperand(1), Depth+1); + Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); // If we don't know any bits, early out. if (Known.isUnknown()) break; - KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth+1); + KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); // Only known if known in both the LHS and RHS. Known.One &= Known2.One; @@ -31650,8 +32016,8 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask, if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) { SmallVector<int, 4> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { - ArrayRef<int> LoMask(Mask.data() + 0, 4); - ArrayRef<int> HiMask(Mask.data() + 4, 4); + ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4); + ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4); // PSHUFLW: permute lower 4 elements only. if (isUndefOrInRange(LoMask, 0, 4) && @@ -31789,8 +32155,8 @@ static bool matchBinaryPermuteShuffle( uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end()); - if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero, - BlendMask)) { + if (matchVectorShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero, + ForceV2Zero, BlendMask)) { if (MaskVT == MVT::v16i16) { // We can only use v16i16 PBLENDW if the lanes are repeated. SmallVector<int, 8> RepeatedMask; @@ -31819,15 +32185,15 @@ static bool matchBinaryPermuteShuffle( } } - // Attempt to combine to INSERTPS. + // Attempt to combine to INSERTPS, but only if it has elements that need to + // be set to zero. if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && - MaskVT.is128BitVector()) { - if (Zeroable.getBoolValue() && - matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { - Shuffle = X86ISD::INSERTPS; - ShuffleVT = MVT::v4f32; - return true; - } + MaskVT.is128BitVector() && + llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }) && + matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { + Shuffle = X86ISD::INSERTPS; + ShuffleVT = MVT::v4f32; + return true; } // Attempt to combine to SHUFPD. 
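One of the quieter fixes above sits in matchUnaryPermuteShuffle: the PSHUFLW/PSHUFHW candidate masks are now sliced from RepeatedMask instead of the original Mask. A short sketch of the distinction, with illustrative values:

    // A wide shuffle (say v16i16) is only a PSHUFLW/PSHUFHW candidate when
    // its mask repeats per 128-bit lane; the canonical per-lane pattern has
    // 8 elements. PSHUFLW permutes its low 4 entries, PSHUFHW its high 4.
    SmallVector<int, 4> RepeatedMask;   // e.g. {1,0,3,2, 4,5,6,7}
    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
      ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4); // {1,0,3,2}: PSHUFLW
      ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4); // {4,5,6,7}: identity
    }

Slicing the original 16-element Mask, as the removed lines did, can disagree with the canonicalized per-lane pattern (for instance when sentinel entries appear in only some lanes), so the wrong permute immediate could be derived for vectors wider than 128 bits.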
@@ -31835,7 +32201,11 @@ static bool matchBinaryPermuteShuffle( ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { - if (matchShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) { + bool ForceV1Zero = false, ForceV2Zero = false; + if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero, + PermuteImm, Mask, Zeroable)) { + V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; + V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; Shuffle = X86ISD::SHUFP; ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64); return true; @@ -31889,6 +32259,15 @@ static bool matchBinaryPermuteShuffle( } } + // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed. + if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && + MaskVT.is128BitVector() && + matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { + Shuffle = X86ISD::INSERTPS; + ShuffleVT = MVT::v4f32; + return true; + } + return false; } @@ -31942,7 +32321,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, unsigned NumRootElts = RootVT.getVectorNumElements(); unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts; bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() || - (RootVT.isFloatingPoint() && Depth >= 2) || + (RootVT.isFloatingPoint() && Depth >= 1) || (RootVT.is256BitVector() && !Subtarget.hasAVX2()); // Don't combine if we are a AVX512/EVEX target and the mask element size @@ -31981,7 +32360,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 && !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) && !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) { - if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128) + if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128) return SDValue(); // Nothing to do! MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64); unsigned PermMask = 0; @@ -31991,7 +32370,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, Res = DAG.getBitcast(ShuffleVT, V1); Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res, DAG.getUNDEF(ShuffleVT), - DAG.getConstant(PermMask, DL, MVT::i8)); + DAG.getTargetConstant(PermMask, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } @@ -32026,8 +32405,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // Which shuffle domains are permitted? // Permit domain crossing at higher combine depths. // TODO: Should we indicate which domain is preferred if both are allowed? - bool AllowFloatDomain = FloatDomain || (Depth > 3); - bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() && + bool AllowFloatDomain = FloatDomain || (Depth >= 3); + bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() && (!MaskVT.is256BitVector() || Subtarget.hasAVX2()); // Determine zeroable mask elements. @@ -32062,14 +32441,14 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (V1.getValueType() == MaskVT && V1.getOpcode() == ISD::SCALAR_TO_VECTOR && MayFoldLoad(V1.getOperand(0))) { - if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST) + if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST) return SDValue(); // Nothing to do! 
Res = V1.getOperand(0); Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res); return DAG.getBitcast(RootVT, Res); } if (Subtarget.hasAVX2()) { - if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST) + if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST) return SDValue(); // Nothing to do! Res = DAG.getBitcast(MaskVT, V1); Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res); @@ -32083,7 +32462,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { - if (Depth == 1 && Root.getOpcode() == Shuffle) + if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! Res = DAG.getBitcast(ShuffleSrcVT, NewV1); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); @@ -32094,11 +32473,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, AllowIntDomain, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { - if (Depth == 1 && Root.getOpcode() == Shuffle) + if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! Res = DAG.getBitcast(ShuffleVT, V1); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, - DAG.getConstant(PermuteImm, DL, MVT::i8)); + DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } } @@ -32109,7 +32488,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT, UnaryShuffle) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { - if (Depth == 1 && Root.getOpcode() == Shuffle) + if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1); NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2); @@ -32123,12 +32502,12 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { - if (Depth == 1 && Root.getOpcode() == Shuffle) + if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! NewV1 = DAG.getBitcast(ShuffleVT, NewV1); NewV2 = DAG.getBitcast(ShuffleVT, NewV2); Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2, - DAG.getConstant(PermuteImm, DL, MVT::i8)); + DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } @@ -32141,34 +32520,34 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, uint64_t BitLen, BitIdx; if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) { - if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI) + if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI) return SDValue(); // Nothing to do! V1 = DAG.getBitcast(IntMaskVT, V1); Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1, - DAG.getConstant(BitLen, DL, MVT::i8), - DAG.getConstant(BitIdx, DL, MVT::i8)); + DAG.getTargetConstant(BitLen, DL, MVT::i8), + DAG.getTargetConstant(BitIdx, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { - if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI) + if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI) return SDValue(); // Nothing to do! 
V1 = DAG.getBitcast(IntMaskVT, V1); V2 = DAG.getBitcast(IntMaskVT, V2); Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2, - DAG.getConstant(BitLen, DL, MVT::i8), - DAG.getConstant(BitIdx, DL, MVT::i8)); + DAG.getTargetConstant(BitLen, DL, MVT::i8), + DAG.getTargetConstant(BitIdx, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } } // Don't try to re-form single instruction chains under any circumstances now // that we've done encoding canonicalization for them. - if (Depth < 2) + if (Depth < 1) return SDValue(); // Depth threshold above which we can efficiently use variable mask shuffles. - int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3; + int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2; AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask; bool MaskContainsZeros = @@ -32321,7 +32700,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, V2 = DAG.getBitcast(MaskVT, V2); SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true); Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp, - DAG.getConstant(M2ZImm, DL, MVT::i8)); + DAG.getTargetConstant(M2ZImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } @@ -32650,7 +33029,7 @@ static SDValue combineX86ShufflesRecursively( // Bound the depth of our recursive combine because this is ultimately // quadratic in nature. const unsigned MaxRecursionDepth = 8; - if (Depth > MaxRecursionDepth) + if (Depth >= MaxRecursionDepth) return SDValue(); // Directly rip through bitcasts to find the underlying operand. @@ -32667,11 +33046,18 @@ static SDValue combineX86ShufflesRecursively( "Can only combine shuffles of the same vector register size."); // Extract target shuffle mask and resolve sentinels and inputs. + // TODO - determine Op's demanded elts from RootMask. SmallVector<int, 64> OpMask; SmallVector<SDValue, 2> OpInputs; - if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG)) + APInt OpUndef, OpZero; + APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); + bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode()); + if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef, + OpZero, DAG, Depth, false)) return SDValue(); + resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero); + // Add the inputs to the Ops list, avoiding duplicates. SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end()); @@ -32772,6 +33158,9 @@ static SDValue combineX86ShufflesRecursively( Mask[i] = OpMaskedIdx; } + // Remove unused/repeated shuffle source ops. + resolveTargetShuffleInputsAndMask(Ops, Mask); + // Handle the all undef/zero cases early. if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) return DAG.getUNDEF(Root.getValueType()); @@ -32783,11 +33172,8 @@ static SDValue combineX86ShufflesRecursively( return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, SDLoc(Root)); - // Remove unused/repeated shuffle source ops. - resolveTargetShuffleInputsAndMask(Ops, Mask); assert(!Ops.empty() && "Shuffle with no inputs detected"); - - HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode()); + HasVariableMask |= IsOpVariableMask; // Update the list of shuffle nodes that have been combined so far. SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(), @@ -32853,7 +33239,7 @@ static SDValue combineX86ShufflesRecursively( /// Helper entry wrapper to combineX86ShufflesRecursively. 
static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1, + return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget); } @@ -33088,7 +33474,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, for (unsigned i = 0; i != Scale; ++i) DemandedMask[i] = i; if (SDValue Res = combineX86ShufflesRecursively( - {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1, + {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getBitcast(SrcVT, Res)); @@ -33120,6 +33506,30 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, VT.getSizeInBits()); } + // vbroadcast(scalarload X) -> vbroadcast_load X + // For float loads, extract other uses of the scalar from the broadcast. + if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) && + ISD::isNormalLoad(Src.getNode())) { + LoadSDNode *LN = cast<LoadSDNode>(Src); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue BcastLd = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, + LN->getMemoryVT(), LN->getMemOperand()); + // If the load value is used only by N, replace it via CombineTo N. + bool NoReplaceExtract = Src.hasOneUse(); + DCI.CombineTo(N.getNode(), BcastLd); + if (NoReplaceExtract) { + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(LN); + } else { + SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd, + DAG.getIntPtrConstant(0, DL)); + DCI.CombineTo(LN, Scl, BcastLd.getValue(1)); + } + return N; // Return N so it doesn't get rechecked! + } + return SDValue(); } case X86ISD::BLENDI: { @@ -33133,14 +33543,14 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, MVT SrcVT = N0.getOperand(0).getSimpleValueType(); if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 && SrcVT.getScalarSizeInBits() >= 32) { - unsigned Mask = N.getConstantOperandVal(2); + unsigned BlendMask = N.getConstantOperandVal(2); unsigned Size = VT.getVectorNumElements(); unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits(); - unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale); + BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale); return DAG.getBitcast( VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0), N1.getOperand(0), - DAG.getConstant(ScaleMask, DL, MVT::i8))); + DAG.getTargetConstant(BlendMask, DL, MVT::i8))); } } return SDValue(); @@ -33208,76 +33618,97 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, // If we zero out all elements from Op0 then we don't need to reference it. if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef()) return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1, - DAG.getConstant(InsertPSMask, DL, MVT::i8)); + DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); // If we zero out the element from Op1 then we don't need to reference it. if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef()) return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), - DAG.getConstant(InsertPSMask, DL, MVT::i8)); + DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); // Attempt to merge insertps Op1 with an inner target shuffle node. 
SmallVector<int, 8> TargetMask1; SmallVector<SDValue, 2> Ops1; - if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) { - int M = TargetMask1[SrcIdx]; - if (isUndefOrZero(M)) { + APInt KnownUndef1, KnownZero1; + if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1, + KnownZero1)) { + if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) { // Zero/UNDEF insertion - zero out element and remove dependency. InsertPSMask |= (1u << DstIdx); return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), - DAG.getConstant(InsertPSMask, DL, MVT::i8)); + DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); } // Update insertps mask srcidx and reference the source input directly. + int M = TargetMask1[SrcIdx]; assert(0 <= M && M < 8 && "Shuffle index out of range"); InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6); Op1 = Ops1[M < 4 ? 0 : 1]; return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, - DAG.getConstant(InsertPSMask, DL, MVT::i8)); + DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); } // Attempt to merge insertps Op0 with an inner target shuffle node. SmallVector<int, 8> TargetMask0; SmallVector<SDValue, 2> Ops0; - if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0)) - return SDValue(); + APInt KnownUndef0, KnownZero0; + if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0, + KnownZero0)) { + bool Updated = false; + bool UseInput00 = false; + bool UseInput01 = false; + for (int i = 0; i != 4; ++i) { + if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { + // No change if element is already zero or the inserted element. + continue; + } else if (KnownUndef0[i] || KnownZero0[i]) { + // If the target mask is undef/zero then we must zero the element. + InsertPSMask |= (1u << i); + Updated = true; + continue; + } - bool Updated = false; - bool UseInput00 = false; - bool UseInput01 = false; - for (int i = 0; i != 4; ++i) { - int M = TargetMask0[i]; - if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { - // No change if element is already zero or the inserted element. - continue; - } else if (isUndefOrZero(M)) { - // If the target mask is undef/zero then we must zero the element. - InsertPSMask |= (1u << i); - Updated = true; - continue; + // The input vector element must be inline. + int M = TargetMask0[i]; + if (M != i && M != (i + 4)) + return SDValue(); + + // Determine which inputs of the target shuffle we're using. + UseInput00 |= (0 <= M && M < 4); + UseInput01 |= (4 <= M); } - // The input vector element must be inline. - if (M != i && M != (i + 4)) - return SDValue(); + // If we're not using both inputs of the target shuffle then use the + // referenced input directly. + if (UseInput00 && !UseInput01) { + Updated = true; + Op0 = Ops0[0]; + } else if (!UseInput00 && UseInput01) { + Updated = true; + Op0 = Ops0[1]; + } - // Determine which inputs of the target shuffle we're using. - UseInput00 |= (0 <= M && M < 4); - UseInput01 |= (4 <= M); + if (Updated) + return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, + DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); } - // If we're not using both inputs of the target shuffle then use the - // referenced input directly. - if (UseInput00 && !UseInput01) { - Updated = true; - Op0 = Ops0[0]; - } else if (!UseInput00 && UseInput01) { - Updated = true; - Op0 = Ops0[1]; + // If we're inserting an element from a vbroadcast load, fold the + // load into the X86insertps instruction. We need to convert the scalar + // load to a vector and clear the source lane of the INSERTPS control. 
+ if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) { + auto *MemIntr = cast<MemIntrinsicSDNode>(Op1); + if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) { + SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(), + MemIntr->getBasePtr(), + MemIntr->getMemOperand()); + SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, + Load), + DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8)); + DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1)); + return Insert; + } } - if (Updated) - return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, - DAG.getConstant(InsertPSMask, DL, MVT::i8)); - return SDValue(); } default: @@ -33580,7 +34011,7 @@ static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG, } /// Eliminate a redundant shuffle of a horizontal math op. -static SDValue foldShuffleOfHorizOp(SDNode *N) { +static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) { unsigned Opcode = N->getOpcode(); if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST) if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef()) @@ -33611,17 +34042,36 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) { HOp.getOperand(0) != HOp.getOperand(1)) return SDValue(); + // The shuffle that we are eliminating may have allowed the horizontal op to + // have an undemanded (undefined) operand. Duplicate the other (defined) + // operand to ensure that the results are defined across all lanes without the + // shuffle. + auto updateHOp = [](SDValue HorizOp, SelectionDAG &DAG) { + SDValue X; + if (HorizOp.getOperand(0).isUndef()) { + assert(!HorizOp.getOperand(1).isUndef() && "Not expecting foldable h-op"); + X = HorizOp.getOperand(1); + } else if (HorizOp.getOperand(1).isUndef()) { + assert(!HorizOp.getOperand(0).isUndef() && "Not expecting foldable h-op"); + X = HorizOp.getOperand(0); + } else { + return HorizOp; + } + return DAG.getNode(HorizOp.getOpcode(), SDLoc(HorizOp), + HorizOp.getValueType(), X, X); + }; + // When the operands of a horizontal math op are identical, the low half of // the result is the same as the high half. If a target shuffle is also - // replicating low and high halves, we don't need the shuffle. + // replicating low and high halves (and without changing the type/length of + // the vector), we don't need the shuffle. 
if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) { - if (HOp.getScalarValueSizeInBits() == 64) { + if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) { // movddup (hadd X, X) --> hadd X, X // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X assert((HOp.getValueType() == MVT::v2f64 || - HOp.getValueType() == MVT::v4f64) && HOp.getValueType() == VT && - "Unexpected type for h-op"); - return HOp; + HOp.getValueType() == MVT::v4f64) && "Unexpected type for h-op"); + return updateHOp(HOp, DAG); } return SDValue(); } @@ -33635,14 +34085,14 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) { (isTargetShuffleEquivalent(Mask, {0, 0}) || isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) || isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}))) - return HOp; + return updateHOp(HOp, DAG); if (HOp.getValueSizeInBits() == 256 && (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) || isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) || isTargetShuffleEquivalent( Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11}))) - return HOp; + return updateHOp(HOp, DAG); return SDValue(); } @@ -33677,7 +34127,7 @@ static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) { // the wide shuffle that we started with. return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0), Shuf->getOperand(1), HalfMask, HalfIdx1, - HalfIdx2, false, DAG); + HalfIdx2, false, DAG, /*UseConcat*/true); } static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, @@ -33696,70 +34146,10 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG)) return AddSub; - if (SDValue HAddSub = foldShuffleOfHorizOp(N)) + if (SDValue HAddSub = foldShuffleOfHorizOp(N, DAG)) return HAddSub; } - // During Type Legalization, when promoting illegal vector types, - // the backend might introduce new shuffle dag nodes and bitcasts. - // - // This code performs the following transformation: - // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) -> - // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>) - // - // We do this only if both the bitcast and the BINOP dag nodes have - // one use. Also, perform this transformation only if the new binary - // operation is legal. This is to avoid introducing dag nodes that - // potentially need to be further expanded (or custom lowered) into a - // less optimal sequence of dag nodes. - if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && - N->getOpcode() == ISD::VECTOR_SHUFFLE && - N->getOperand(0).getOpcode() == ISD::BITCAST && - N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - SDValue BC0 = N0.getOperand(0); - EVT SVT = BC0.getValueType(); - unsigned Opcode = BC0.getOpcode(); - unsigned NumElts = VT.getVectorNumElements(); - - if (BC0.hasOneUse() && SVT.isVector() && - SVT.getVectorNumElements() * 2 == NumElts && - TLI.isOperationLegal(Opcode, VT)) { - bool CanFold = false; - switch (Opcode) { - default : break; - case ISD::ADD: - case ISD::SUB: - case ISD::MUL: - // isOperationLegal lies for integer ops on floating point types. - CanFold = VT.isInteger(); - break; - case ISD::FADD: - case ISD::FSUB: - case ISD::FMUL: - // isOperationLegal lies for floating point ops on integer types. 
- CanFold = VT.isFloatingPoint(); - break; - } - - unsigned SVTNumElts = SVT.getVectorNumElements(); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) - CanFold = SVOp->getMaskElt(i) == (int)(i * 2); - for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) - CanFold = SVOp->getMaskElt(i) < 0; - - if (CanFold) { - SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0)); - SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1)); - SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); - return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask()); - } - } - } - // Attempt to combine into a vector load/broadcast. if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true)) return LD; @@ -33841,7 +34231,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() && ISD::isNormalLoad(N->getOperand(0).getNode())) { LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); - if (!LN->isVolatile()) { + if (LN->isSimple()) { SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; SDValue VZLoad = @@ -33855,53 +34245,6 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, } } - - // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the - // operands is an extend from v2i32 to v2i64. Turn it into a pmulld. - // FIXME: This can probably go away once we default to widening legalization. - if (Subtarget.hasSSE41() && VT == MVT::v4i32 && - N->getOpcode() == ISD::VECTOR_SHUFFLE && - N->getOperand(0).getOpcode() == ISD::BITCAST && - N->getOperand(0).getOperand(0).getOpcode() == X86ISD::PMULUDQ) { - SDValue BC = N->getOperand(0); - SDValue MULUDQ = BC.getOperand(0); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - ArrayRef<int> Mask = SVOp->getMask(); - if (BC.hasOneUse() && MULUDQ.hasOneUse() && - Mask[0] == 0 && Mask[1] == 2 && Mask[2] == -1 && Mask[3] == -1) { - SDValue Op0 = MULUDQ.getOperand(0); - SDValue Op1 = MULUDQ.getOperand(1); - if (Op0.getOpcode() == ISD::BITCAST && - Op0.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE && - Op0.getOperand(0).getValueType() == MVT::v4i32) { - ShuffleVectorSDNode *SVOp0 = - cast<ShuffleVectorSDNode>(Op0.getOperand(0)); - ArrayRef<int> Mask2 = SVOp0->getMask(); - if (Mask2[0] == 0 && Mask2[1] == -1 && - Mask2[2] == 1 && Mask2[3] == -1) { - Op0 = SVOp0->getOperand(0); - Op1 = DAG.getBitcast(MVT::v4i32, Op1); - Op1 = DAG.getVectorShuffle(MVT::v4i32, dl, Op1, Op1, Mask); - return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1); - } - } - if (Op1.getOpcode() == ISD::BITCAST && - Op1.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE && - Op1.getOperand(0).getValueType() == MVT::v4i32) { - ShuffleVectorSDNode *SVOp1 = - cast<ShuffleVectorSDNode>(Op1.getOperand(0)); - ArrayRef<int> Mask2 = SVOp1->getMask(); - if (Mask2[0] == 0 && Mask2[1] == -1 && - Mask2[2] == 1 && Mask2[3] == -1) { - Op0 = DAG.getBitcast(MVT::v4i32, Op0); - Op0 = DAG.getVectorShuffle(MVT::v4i32, dl, Op0, Op0, Mask); - Op1 = SVOp1->getOperand(0); - return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1); - } - } - } - } - return SDValue(); } @@ -33966,6 +34309,84 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // TODO convert SrcUndef to KnownUndef. 
break; } + case X86ISD::KSHIFTL: { + SDValue Src = Op.getOperand(0); + auto *Amt = cast<ConstantSDNode>(Op.getOperand(1)); + assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount"); + unsigned ShiftAmt = Amt->getZExtValue(); + + if (ShiftAmt == 0) + return TLO.CombineTo(Op, Src); + + // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a + // single shift. We can do this if the bottom bits (which are shifted + // out) are never demanded. + if (Src.getOpcode() == X86ISD::KSHIFTR) { + if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) { + unsigned C1 = Src.getConstantOperandVal(1); + unsigned NewOpc = X86ISD::KSHIFTL; + int Diff = ShiftAmt - C1; + if (Diff < 0) { + Diff = -Diff; + NewOpc = X86ISD::KSHIFTR; + } + + SDLoc dl(Op); + SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8); + return TLO.CombineTo( + Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA)); + } + } + + APInt DemandedSrc = DemandedElts.lshr(ShiftAmt); + if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO, + Depth + 1)) + return true; + + KnownUndef <<= ShiftAmt; + KnownZero <<= ShiftAmt; + KnownZero.setLowBits(ShiftAmt); + break; + } + case X86ISD::KSHIFTR: { + SDValue Src = Op.getOperand(0); + auto *Amt = cast<ConstantSDNode>(Op.getOperand(1)); + assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount"); + unsigned ShiftAmt = Amt->getZExtValue(); + + if (ShiftAmt == 0) + return TLO.CombineTo(Op, Src); + + // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a + // single shift. We can do this if the top bits (which are shifted + // out) are never demanded. + if (Src.getOpcode() == X86ISD::KSHIFTL) { + if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) { + unsigned C1 = Src.getConstantOperandVal(1); + unsigned NewOpc = X86ISD::KSHIFTR; + int Diff = ShiftAmt - C1; + if (Diff < 0) { + Diff = -Diff; + NewOpc = X86ISD::KSHIFTL; + } + + SDLoc dl(Op); + SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8); + return TLO.CombineTo( + Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA)); + } + } + + APInt DemandedSrc = DemandedElts.shl(ShiftAmt); + if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO, + Depth + 1)) + return true; + + KnownUndef.lshrInPlace(ShiftAmt); + KnownZero.lshrInPlace(ShiftAmt); + KnownZero.setHighBits(ShiftAmt); + break; + } case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: { SDValue Src = Op.getOperand(0); @@ -33979,16 +34400,36 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } case X86ISD::PACKSS: case X86ISD::PACKUS: { + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + APInt DemandedLHS, DemandedRHS; getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); APInt SrcUndef, SrcZero; - if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, SrcUndef, - SrcZero, TLO, Depth + 1)) + if (SimplifyDemandedVectorElts(N0, DemandedLHS, SrcUndef, SrcZero, TLO, + Depth + 1)) return true; - if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, SrcUndef, - SrcZero, TLO, Depth + 1)) + if (SimplifyDemandedVectorElts(N1, DemandedRHS, SrcUndef, SrcZero, TLO, + Depth + 1)) return true; + + // Aggressively peek through ops to get at the demanded elts. + // TODO - we should do this for all target/faux shuffles ops. 
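[Editor's note] The new KSHIFTL/KSHIFTR cases fold a shift pair into a single shift when the lanes shifted out are never demanded. A self-contained model of that demanded-elements argument on a 64-bit mask register (one bit per lane; all names are illustrative):

    #include <cassert>
    #include <cstdint>

    // KSHIFTL(KSHIFTR(X, C1), ShAmt) as bit arithmetic.
    uint64_t shiftPair(uint64_t X, unsigned C1, unsigned ShAmt) {
      return (X >> C1) << ShAmt;
    }

    // The single replacement shift by |ShAmt - C1|.
    uint64_t singleShift(uint64_t X, unsigned C1, unsigned ShAmt) {
      int Diff = (int)ShAmt - (int)C1;
      return Diff >= 0 ? X << Diff : X >> -Diff;
    }

    int main() {
      uint64_t X = 0xDEADBEEFCAFEF00DULL;
      unsigned C1 = 3, ShAmt = 5;
      // The low ShAmt lanes are not demanded, so mask them off: on the
      // remaining lanes the two forms agree for any X.
      uint64_t Demanded = ~((1ULL << ShAmt) - 1);
      assert((shiftPair(X, C1, ShAmt) & Demanded) ==
             (singleShift(X, C1, ShAmt) & Demanded));
    }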
+ if (!DemandedElts.isAllOnesValue()) { + APInt DemandedSrcBits = + APInt::getAllOnesValue(N0.getScalarValueSizeInBits()); + SDValue NewN0 = SimplifyMultipleUseDemandedBits( + N0, DemandedSrcBits, DemandedLHS, TLO.DAG, Depth + 1); + SDValue NewN1 = SimplifyMultipleUseDemandedBits( + N1, DemandedSrcBits, DemandedRHS, TLO.DAG, Depth + 1); + if (NewN0 || NewN1) { + NewN0 = NewN0 ? NewN0 : N0; + NewN1 = NewN1 ? NewN1 : N1; + return TLO.CombineTo(Op, + TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1)); + } + } break; } case X86ISD::HADD: @@ -34062,25 +34503,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return true; break; } - case X86ISD::SUBV_BROADCAST: { - // Reduce size of broadcast if we don't need the upper half. - unsigned HalfElts = NumElts / 2; - if (DemandedElts.extractBits(HalfElts, HalfElts).isNullValue()) { - SDValue Src = Op.getOperand(0); - MVT SrcVT = Src.getSimpleValueType(); - - SDValue Half = Src; - if (SrcVT.getVectorNumElements() != HalfElts) { - MVT HalfVT = MVT::getVectorVT(SrcVT.getScalarType(), HalfElts); - Half = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, SDLoc(Op), HalfVT, Src); - } - - return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Half, 0, - TLO.DAG, SDLoc(Op), - Half.getValueSizeInBits())); - } - break; - } case X86ISD::VPERMV: { SDValue Mask = Op.getOperand(0); APInt MaskUndef, MaskZero; @@ -34135,6 +34557,21 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); return TLO.CombineTo(Op, Insert); } + // Subvector broadcast. + case X86ISD::SUBV_BROADCAST: { + SDLoc DL(Op); + SDValue Src = Op.getOperand(0); + if (Src.getValueSizeInBits() > ExtSizeInBits) + Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits); + else if (Src.getValueSizeInBits() < ExtSizeInBits) { + MVT SrcSVT = Src.getSimpleValueType().getScalarType(); + MVT SrcVT = + MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits()); + Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src); + } + return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0, + TLO.DAG, DL, ExtSizeInBits)); + } // Byte shifts by immediate. case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: @@ -34201,36 +34638,30 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } } - // Simplify target shuffles. - if (!isTargetShuffle(Opc) || !VT.isSimple()) - return false; - - // Get target shuffle mask. - bool IsUnary; + // Get target/faux shuffle mask. + APInt OpUndef, OpZero; SmallVector<int, 64> OpMask; SmallVector<SDValue, 2> OpInputs; - if (!getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, OpInputs, - OpMask, IsUnary)) + if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef, + OpZero, TLO.DAG, Depth, false)) return false; - // Shuffle inputs must be the same type as the result. - if (llvm::any_of(OpInputs, - [VT](SDValue V) { return VT != V.getValueType(); })) + // Shuffle inputs must be the same size as the result. + if (OpMask.size() != (unsigned)NumElts || + llvm::any_of(OpInputs, [VT](SDValue V) { + return VT.getSizeInBits() != V.getValueSizeInBits() || + !V.getValueType().isVector(); + })) return false; - // Clear known elts that might have been set above. - KnownZero.clearAllBits(); - KnownUndef.clearAllBits(); + KnownZero = OpZero; + KnownUndef = OpUndef; // Check if shuffle mask can be simplified to undef/zero/identity. 
int NumSrcs = OpInputs.size(); - for (int i = 0; i != NumElts; ++i) { - int &M = OpMask[i]; + for (int i = 0; i != NumElts; ++i) if (!DemandedElts[i]) - M = SM_SentinelUndef; - else if (0 <= M && OpInputs[M / NumElts].isUndef()) - M = SM_SentinelUndef; - } + OpMask[i] = SM_SentinelUndef; if (isUndefInRange(OpMask, 0, NumElts)) { KnownUndef.setAllBits(); @@ -34243,10 +34674,14 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } for (int Src = 0; Src != NumSrcs; ++Src) if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts)) - return TLO.CombineTo(Op, OpInputs[Src]); + return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src])); // Attempt to simplify inputs. for (int Src = 0; Src != NumSrcs; ++Src) { + // TODO: Support inputs of different types. + if (OpInputs[Src].getValueType() != VT) + continue; + int Lo = Src * NumElts; APInt SrcElts = APInt::getNullValue(NumElts); for (int i = 0; i != NumElts; ++i) @@ -34256,21 +34691,13 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SrcElts.setBit(M); } + // TODO - Propagate input undef/zero elts. APInt SrcUndef, SrcZero; if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; } - // Extract known zero/undef elements. - // TODO - Propagate input undef/zero elts. - for (int i = 0; i != NumElts; ++i) { - if (OpMask[i] == SM_SentinelUndef) - KnownUndef.setBit(i); - if (OpMask[i] == SM_SentinelZero) - KnownZero.setBit(i); - } - return false; } @@ -34296,6 +34723,18 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp, TLO, Depth + 1)) return true; + + // Aggressively peek through ops to get at the demanded low bits. + SDValue DemandedLHS = SimplifyMultipleUseDemandedBits( + LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1); + SDValue DemandedRHS = SimplifyMultipleUseDemandedBits( + RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1); + if (DemandedLHS || DemandedRHS) { + DemandedLHS = DemandedLHS ? DemandedLHS : LHS; + DemandedRHS = DemandedRHS ? DemandedRHS : RHS; + return TLO.CombineTo( + Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS)); + } break; } case X86ISD::VSHLI: { @@ -34323,7 +34762,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( unsigned NewOpc = Diff < 0 ? 
X86ISD::VSRLI : X86ISD::VSHLI; SDValue NewShift = TLO.DAG.getNode( NewOpc, SDLoc(Op), VT, Op0.getOperand(0), - TLO.DAG.getConstant(std::abs(Diff), SDLoc(Op), MVT::i8)); + TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8)); return TLO.CombineTo(Op, NewShift); } } @@ -34441,6 +34880,11 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( KnownVec, TLO, Depth + 1)) return true; + if (SDValue V = SimplifyMultipleUseDemandedBits( + Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1)) + return TLO.CombineTo( + Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1))); + Known = KnownVec.zext(BitWidth, true); return false; } @@ -34542,12 +34986,80 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); } +SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( + SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, + SelectionDAG &DAG, unsigned Depth) const { + int NumElts = DemandedElts.getBitWidth(); + unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); + + switch (Opc) { + case X86ISD::PINSRB: + case X86ISD::PINSRW: { + // If we don't demand the inserted element, return the base vector. + SDValue Vec = Op.getOperand(0); + auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + MVT VecVT = Vec.getSimpleValueType(); + if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) && + !DemandedElts[CIdx->getZExtValue()]) + return Vec; + break; + } + } + + APInt ShuffleUndef, ShuffleZero; + SmallVector<int, 16> ShuffleMask; + SmallVector<SDValue, 2> ShuffleOps; + if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask, + ShuffleUndef, ShuffleZero, DAG, Depth, false)) { + // If all the demanded elts are from one operand and are inline, + // then we can use the operand directly. + int NumOps = ShuffleOps.size(); + if (ShuffleMask.size() == (unsigned)NumElts && + llvm::all_of(ShuffleOps, [VT](SDValue V) { + return VT.getSizeInBits() == V.getValueSizeInBits(); + })) { + + if (DemandedElts.isSubsetOf(ShuffleUndef)) + return DAG.getUNDEF(VT); + if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero)) + return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op)); + + // Bitmask that indicates which ops have only been accessed 'inline'. + APInt IdentityOp = APInt::getAllOnesValue(NumOps); + for (int i = 0; i != NumElts; ++i) { + int M = ShuffleMask[i]; + if (!DemandedElts[i] || ShuffleUndef[i]) + continue; + int Op = M / NumElts; + int Index = M % NumElts; + if (M < 0 || Index != i) { + IdentityOp.clearAllBits(); + break; + } + IdentityOp &= APInt::getOneBitSet(NumOps, Op); + if (IdentityOp == 0) + break; + } + assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) && + "Multiple identity shuffles detected"); + + if (IdentityOp != 0) + return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]); + } + } + + return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( + Op, DemandedBits, DemandedElts, DAG, Depth); +} + /// Check if a vector extract from a target-specific shuffle of a load can be /// folded into a single element load. /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but /// shuffles have been custom lowered so we need to handle those here. 
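[Editor's note] The IdentityOp bookkeeping in SimplifyMultipleUseDemandedBitsForTargetNode above can be read as: if every demanded, non-undef lane of a two-input shuffle reads lane i of one and the same source, the whole shuffle can be replaced by that source. A simplified standalone sketch of that test (plain ints instead of APInt; sentinel and zeroable handling omitted):

    #include <optional>
    #include <vector>

    // Returns the index of the single source the shuffle is an identity of,
    // or nullopt if the demanded lanes are not inline or mix both sources.
    std::optional<int> identitySource(const std::vector<int> &Mask,
                                      const std::vector<bool> &Demanded,
                                      int NumElts) {
      int Src = -1;
      for (int i = 0; i != NumElts; ++i) {
        if (!Demanded[i] || Mask[i] < 0)
          continue;                    // undef/undemanded lanes don't constrain
        int Op = Mask[i] / NumElts;    // which input this lane comes from
        int Index = Mask[i] % NumElts; // which lane of that input
        if (Index != i || (Src != -1 && Src != Op))
          return std::nullopt;
        Src = Op;
      }
      return Src < 0 ? std::nullopt : std::optional<int>(Src);
    }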
-static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { +static SDValue +XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -34559,13 +35071,17 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT OriginalVT = InVec.getValueType(); + unsigned NumOriginalElts = OriginalVT.getVectorNumElements(); // Peek through bitcasts, don't duplicate a load with other uses. InVec = peekThroughOneUseBitcasts(InVec); EVT CurrentVT = InVec.getValueType(); - if (!CurrentVT.isVector() || - CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) + if (!CurrentVT.isVector()) + return SDValue(); + + unsigned NumCurrentElts = CurrentVT.getVectorNumElements(); + if ((NumOriginalElts % NumCurrentElts) != 0) return SDValue(); if (!isTargetShuffle(InVec.getOpcode())) @@ -34582,10 +35098,17 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, ShuffleOps, ShuffleMask, UnaryShuffle)) return SDValue(); + unsigned Scale = NumOriginalElts / NumCurrentElts; + if (Scale > 1) { + SmallVector<int, 16> ScaledMask; + scaleShuffleMask<int>(Scale, ShuffleMask, ScaledMask); + ShuffleMask = std::move(ScaledMask); + } + assert(ShuffleMask.size() == NumOriginalElts && "Shuffle mask size mismatch"); + // Select the input vector, guarding against out of range extract vector. - unsigned NumElems = CurrentVT.getVectorNumElements(); int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); - int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt]; + int Idx = (Elt > (int)NumOriginalElts) ? SM_SentinelUndef : ShuffleMask[Elt]; if (Idx == SM_SentinelZero) return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT) @@ -34598,8 +35121,9 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; })) return SDValue(); - assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range"); - SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] : ShuffleOps[1]; + assert(0 <= Idx && Idx < (int)(2 * NumOriginalElts) && + "Shuffle index out of range"); + SDValue LdNode = (Idx < (int)NumOriginalElts) ? ShuffleOps[0] : ShuffleOps[1]; // If inputs to shuffle are the same for both ops, then allow 2 uses unsigned AllowedUses = @@ -34619,7 +35143,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, LoadSDNode *LN0 = cast<LoadSDNode>(LdNode); - if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) + if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || !LN0->isSimple()) return SDValue(); // If there's a bitcast before the shuffle, check if the load type and @@ -34637,10 +35161,11 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SDLoc dl(N); // Create shuffle node taking into account the case that its a unary shuffle - SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1]; - Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle, - ShuffleMask); - Shuffle = DAG.getBitcast(OriginalVT, Shuffle); + SDValue Shuffle = UnaryShuffle ? 
DAG.getUNDEF(OriginalVT) + : DAG.getBitcast(OriginalVT, ShuffleOps[1]); + Shuffle = DAG.getVectorShuffle(OriginalVT, dl, + DAG.getBitcast(OriginalVT, ShuffleOps[0]), + Shuffle, ShuffleMask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, EltNo); } @@ -34660,6 +35185,23 @@ static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) { return false; } +// Helper to push sign extension of vXi1 SETCC result through bitops. +static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, + SDValue Src, const SDLoc &DL) { + switch (Src.getOpcode()) { + case ISD::SETCC: + return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); + case ISD::AND: + case ISD::XOR: + case ISD::OR: + return DAG.getNode( + Src.getOpcode(), DL, SExtVT, + signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL), + signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL)); + } + llvm_unreachable("Unexpected node type for vXi1 sign extension"); +} + // Try to match patterns such as // (i16 bitcast (v16i1 x)) // -> @@ -34698,6 +35240,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as: // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef) MVT SExtVT; + bool PropagateSExt = false; switch (SrcVT.getSimpleVT().SimpleTy) { default: return SDValue(); @@ -34708,8 +35251,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, SExtVT = MVT::v4i32; // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2)) // sign-extend to a 256-bit operation to avoid truncation. - if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) + if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) { SExtVT = MVT::v4i64; + PropagateSExt = true; + } break; case MVT::v8i1: SExtVT = MVT::v8i16; @@ -34718,11 +35263,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over // 256-bit because the shuffle is cheaper than sign extending the result of // the compare. - // TODO : use checkBitcastSrcVectorSize - if (Src.getOpcode() == ISD::SETCC && Subtarget.hasAVX() && - (Src.getOperand(0).getValueType().is256BitVector() || - Src.getOperand(0).getValueType().is512BitVector())) { + if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) || + checkBitcastSrcVectorSize(Src, 512))) { SExtVT = MVT::v8i32; + PropagateSExt = true; } break; case MVT::v16i1: @@ -34745,19 +35289,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, return SDValue(); }; - SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); + SDValue V = PropagateSExt ? 
signExtendBitcastSrcVector(DAG, SExtVT, Src, DL) + : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); - if (SExtVT == MVT::v64i8) { - SDValue Lo, Hi; - std::tie(Lo, Hi) = DAG.SplitVector(V, DL); - Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo); - Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo); - Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi); - Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi); - Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, - DAG.getConstant(32, DL, MVT::i8)); - V = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi); - } else if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) { + if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) { V = getPMOVMSKB(DL, V, DAG, Subtarget); } else { if (SExtVT == MVT::v8i16) @@ -34891,8 +35426,8 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned ShufMask = (NumElts > 2 ? 0 : 0x44); return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, - DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat, - DAG.getConstant(ShufMask, DL, MVT::i8)); + DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), + Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8)); } Ops.append(NumElts, Splat); } else { @@ -34935,6 +35470,24 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget)) return V; + // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type + // legalization destroys the v4i32 type. + if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && SrcVT == MVT::v4i1 && + VT.isScalarInteger() && N0.getOpcode() == ISD::SETCC && + N0.getOperand(0).getValueType() == MVT::v4i32 && + ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) && + cast<CondCodeSDNode>(N0.getOperand(2))->get() == ISD::SETLT) { + SDValue N00 = N0.getOperand(0); + // Only do this if we can avoid scalarizing the input. + if (ISD::isNormalLoad(N00.getNode()) || + (N00.getOpcode() == ISD::BITCAST && + N00.getOperand(0).getValueType() == MVT::v4f32)) { + SDValue V = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, + DAG.getBitcast(MVT::v4f32, N00)); + return DAG.getZExtOrTrunc(V, dl, VT); + } + } + // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer // type, widen both sides to avoid a trip through memory. if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() && @@ -34949,6 +35502,26 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, // type, widen both sides to avoid a trip through memory. if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() && Subtarget.hasAVX512()) { + // Use zeros for the widening if we already have some zeroes. This can + // allow SimplifyDemandedBits to remove scalar ANDs that may be down + // stream of this. + // FIXME: It might make sense to detect a concat_vectors with a mix of + // zeroes and undef and turn it into insert_subvector for i1 vectors as + // a separate combine. What we can't do is canonicalize the operands of + // such a concat or we'll get into a loop with SimplifyDemandedBits.
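[Editor's note] The SSE1 pattern recognized above works because a signed less-than-zero compare of v4i32 only tests the sign bits, and movmskps extracts exactly those. A minimal intrinsics illustration of the target shape (the recognition itself lives in the backend):

    #include <immintrin.h>

    // movmskps gathers the four sign bits of a v4f32 into the low bits of a
    // scalar - the same value as bitcasting (v4i1 setlt-zero) to an integer.
    int sign_mask(__m128 v) {
      return _mm_movemask_ps(v);
    }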
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS) { + SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1); + if (ISD::isBuildVectorAllZeros(LastOp.getNode())) { + SrcVT = LastOp.getValueType(); + unsigned NumConcats = 8 / SrcVT.getVectorNumElements(); + SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end()); + Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT)); + N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops); + N0 = DAG.getBitcast(MVT::i8, N0); + return DAG.getNode(ISD::TRUNCATE, dl, VT, N0); + } + } + unsigned NumConcats = 8 / SrcVT.getVectorNumElements(); SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT)); Ops[0] = N0; @@ -34958,6 +35531,33 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, } } + // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and + // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur + // due to insert_subvector legalization on KNL. By promoting the copy to i16 + // we can help with known bits propagation from the vXi1 domain to the + // scalar domain. + if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() && + !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && + N0.getOperand(0).getValueType() == MVT::v16i1 && + isNullConstant(N0.getOperand(1))) + return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, + DAG.getBitcast(MVT::i16, N0.getOperand(0))); + + // Combine (bitcast (vbroadcast_load)) -> (vbroadcast_load). The memory VT + // determines the number of bits loaded. Remaining bits are zero. + if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() && + VT.getScalarSizeInBits() == SrcVT.getScalarSizeInBits()) { + auto *BCast = cast<MemIntrinsicSDNode>(N0); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() }; + SDValue ResNode = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops, + VT.getVectorElementType(), + BCast->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1)); + return ResNode; + } + // Since MMX types are special and don't usually play with other vector types, // it's better to handle them early to be sure we emit efficient code by // avoiding store-load conversions. @@ -35152,7 +35752,7 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG, // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns. ISD::NodeType BinOp; SDValue Src = DAG.matchBinOpReduction( - Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}); + Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true); if (!Src) return SDValue(); @@ -35246,29 +35846,31 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, SDLoc DL(Extract); EVT MatchVT = Match.getValueType(); unsigned NumElts = MatchVT.getVectorNumElements(); + unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (ExtractVT == MVT::i1) { // Special case for (pre-legalization) vXi1 reductions. - if (NumElts > 32) + if (NumElts > 64 || !isPowerOf2_32(NumElts)) return SDValue(); - if (DAG.getTargetLoweringInfo().isTypeLegal(MatchVT)) { + if (TLI.isTypeLegal(MatchVT)) { // If this is a legal AVX512 predicate type then we can just bitcast. EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); Movmsk = DAG.getBitcast(MovmskVT, Match); } else { // Use combineBitcastvxi1 to create the MOVMSK.
- if (NumElts == 32 && !Subtarget.hasInt256()) { + while (NumElts > MaxElts) { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(Match, DL); Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi); - NumElts = 16; + NumElts /= 2; } EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget); } if (!Movmsk) return SDValue(); - Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, MVT::i32); + Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32); } else { // Bail with AVX512VL (which uses predicate registers). if (Subtarget.hasVLX()) @@ -35309,13 +35911,15 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget); NumElts = MaskSrcVT.getVectorNumElements(); } - assert(NumElts <= 32 && "Not expecting more than 32 elements"); + assert((NumElts <= 32 || NumElts == 64) && + "Not expecting more than 64 elements"); + MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32; if (BinOp == ISD::XOR) { // parity -> (AND (CTPOP(MOVMSK X)), 1) - SDValue Mask = DAG.getConstant(1, DL, MVT::i32); - SDValue Result = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Movmsk); - Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, Mask); + SDValue Mask = DAG.getConstant(1, DL, CmpVT); + SDValue Result = DAG.getNode(ISD::CTPOP, DL, CmpVT, Movmsk); + Result = DAG.getNode(ISD::AND, DL, CmpVT, Result, Mask); return DAG.getZExtOrTrunc(Result, DL, ExtractVT); } @@ -35323,19 +35927,19 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, ISD::CondCode CondCode; if (BinOp == ISD::OR) { // any_of -> MOVMSK != 0 - CmpC = DAG.getConstant(0, DL, MVT::i32); + CmpC = DAG.getConstant(0, DL, CmpVT); CondCode = ISD::CondCode::SETNE; } else { // all_of -> MOVMSK == ((1 << NumElts) - 1) - CmpC = DAG.getConstant((1ULL << NumElts) - 1, DL, MVT::i32); + CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts), + DL, CmpVT); CondCode = ISD::CondCode::SETEQ; } // The setcc produces an i8 of 0/1, so extend that to the result width and // negate to get the final 0/-1 mask value. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT SetccVT = - TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); + TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT); SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode); SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT); SDValue Zero = DAG.getConstant(0, DL, ExtractVT); @@ -35431,6 +36035,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); + SDLoc dl(N); SDValue Src = N->getOperand(0); SDValue Idx = N->getOperand(1); @@ -35452,10 +36057,37 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(VT, SrcOp); } + // If we're extracting a single element from a broadcast load and there are + // no other users, just create a single load. 
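[Editor's note] Once the predicate vector has been reduced to a MOVMSK-style integer, the OR/AND/XOR cases in combineHorizontalPredicateResult above become the scalar tests below (a sketch assuming C++20 for std::popcount; the i64 path added in this patch covers 64-lane masks):

    #include <bit>
    #include <cstdint>

    // any_of -> MOVMSK != 0
    bool any_of(uint64_t Movmsk) { return Movmsk != 0; }

    // all_of -> MOVMSK == (low NumElts bits all set)
    bool all_of(uint64_t Movmsk, unsigned NumElts) {
      uint64_t All = NumElts >= 64 ? ~0ULL : (1ULL << NumElts) - 1;
      return Movmsk == All;
    }

    // parity -> (AND (CTPOP MOVMSK), 1)
    bool parity(uint64_t Movmsk) { return std::popcount(Movmsk) & 1; }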
+ if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) { + auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC); + unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits(); + if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth && + VT.getSizeInBits() == SrcBCWidth) { + SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(), + MemIntr->getBasePtr(), + MemIntr->getPointerInfo(), + MemIntr->getAlignment(), + MemIntr->getMemOperand()->getFlags()); + DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1)); + return Load; + } + } + + // Handle extract(truncate(x)) for 0'th index. + // TODO: Treat this as a faux shuffle? + // TODO: When can we use this for general indices? + if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() && + isNullConstant(Idx)) { + Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl); + Src = DAG.getBitcast(SrcVT, Src); + return DAG.getNode(N->getOpcode(), dl, VT, Src, Idx); + } + // Resolve the target shuffle inputs and mask. SmallVector<int, 16> Mask; SmallVector<SDValue, 2> Ops; - if (!resolveTargetShuffleInputs(SrcBC, Ops, Mask, DAG)) + if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG)) return SDValue(); // Attempt to narrow/widen the shuffle mask to the correct size. @@ -35489,7 +36121,6 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, return SDValue(); int SrcIdx = Mask[N->getConstantOperandVal(1)]; - SDLoc dl(N); // If the shuffle source element is undef/zero then we can just accept it. if (SrcIdx == SM_SentinelUndef) @@ -35584,7 +36215,7 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) { } // TODO: This switch could include FNEG and the x86-specific FP logic ops - // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid + // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid // missed load folding and fma+fneg combining. switch (Vec.getOpcode()) { case ISD::FMA: // Begin 3 operands @@ -35631,27 +36262,84 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) { static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller"); - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); - if (!Subtarget.hasFastHorizontalOps() && !OptForSize) - return SDValue(); - SDValue Index = ExtElt->getOperand(1); - if (!isNullConstant(Index)) - return SDValue(); - // TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros. ISD::NodeType Opc; - SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD}); + SDValue Rdx = + DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true); if (!Rdx) return SDValue(); + SDValue Index = ExtElt->getOperand(1); + assert(isNullConstant(Index) && + "Reduction doesn't end in an extract from index 0"); + EVT VT = ExtElt->getValueType(0); - EVT VecVT = ExtElt->getOperand(0).getValueType(); + EVT VecVT = Rdx.getValueType(); if (VecVT.getScalarType() != VT) return SDValue(); - unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD; SDLoc DL(ExtElt); + // vXi8 reduction - sub 128-bit vector. + if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) { + if (VecVT == MVT::v4i8) { + // Pad with zero. 
+ if (Subtarget.hasSSE41()) { + Rdx = DAG.getBitcast(MVT::i32, Rdx); + Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32, + DAG.getConstant(0, DL, MVT::v4i32), Rdx, + DAG.getIntPtrConstant(0, DL)); + Rdx = DAG.getBitcast(MVT::v16i8, Rdx); + } else { + Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx, + DAG.getConstant(0, DL, VecVT)); + } + } + if (Rdx.getValueType() == MVT::v8i8) { + // Pad with undef. + Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx, + DAG.getUNDEF(MVT::v8i8)); + } + Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx, + DAG.getConstant(0, DL, MVT::v16i8)); + Rdx = DAG.getBitcast(MVT::v16i8, Rdx); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index); + } + + // Must be a >=128-bit vector with pow2 elements. + if ((VecVT.getSizeInBits() % 128) != 0 || + !isPowerOf2_32(VecVT.getVectorNumElements())) + return SDValue(); + + // vXi8 reduction - sum lo/hi halves then use PSADBW. + if (VT == MVT::i8) { + while (Rdx.getValueSizeInBits() > 128) { + unsigned HalfSize = VecVT.getSizeInBits() / 2; + unsigned HalfElts = VecVT.getVectorNumElements() / 2; + SDValue Lo = extractSubVector(Rdx, 0, DAG, DL, HalfSize); + SDValue Hi = extractSubVector(Rdx, HalfElts, DAG, DL, HalfSize); + Rdx = DAG.getNode(ISD::ADD, DL, Lo.getValueType(), Lo, Hi); + VecVT = Rdx.getValueType(); + } + assert(VecVT == MVT::v16i8 && "v16i8 reduction expected"); + + SDValue Hi = DAG.getVectorShuffle( + MVT::v16i8, DL, Rdx, Rdx, + {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1}); + Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi); + Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx, + getZeroVector(MVT::v16i8, Subtarget, DAG, DL)); + Rdx = DAG.getBitcast(MVT::v16i8, Rdx); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index); + } + + // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize. + bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + if (!Subtarget.hasFastHorizontalOps() && !OptForSize) + return SDValue(); + + unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD; + // 256-bit horizontal instructions operate on 128-bit chunks rather than // across the whole vector, so we need an extract + hop preliminary stage. // This is the only step where the operands of the hop are not the same value. 
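[Editor's note] The vXi8 reduction path above leans on PSADBW against zero, which sums each group of eight bytes into a 64-bit lane. One common intrinsics shape of that reduction for sixteen bytes (this returns the full unsigned sum; the combine itself only keeps the i8 truncation of it):

    #include <immintrin.h>
    #include <cstdint>

    uint32_t sum16bytes(__m128i v) {
      // psadbw v, 0: byte sums of the two 8-byte halves, one per 64-bit lane.
      __m128i sad = _mm_sad_epu8(v, _mm_setzero_si128());
      // Swap the 64-bit halves and add to combine the two partial sums.
      __m128i hi  = _mm_shuffle_epi32(sad, _MM_SHUFFLE(1, 0, 3, 2));
      return _mm_cvtsi128_si32(_mm_add_epi32(sad, hi));
    }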
@@ -35661,15 +36349,14 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG, unsigned NumElts = VecVT.getVectorNumElements(); SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL); SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL); - VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2); - Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo); + Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo); + VecVT = Rdx.getValueType(); } if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) && !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3())) return SDValue(); // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0 - assert(Rdx.getValueType() == VecVT && "Unexpected reduction match"); unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements()); for (unsigned i = 0; i != ReductionSteps; ++i) Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx); @@ -35714,15 +36401,26 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, } } - // TODO - Remove this once we can handle the implicit zero-extension of - // X86ISD::PEXTRW/X86ISD::PEXTRB in: - // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and - // combineBasicSADPattern. if (IsPextr) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits( SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI)) return SDValue(N, 0); + + // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling). + if ((InputVector.getOpcode() == X86ISD::PINSRB || + InputVector.getOpcode() == X86ISD::PINSRW) && + InputVector.getOperand(2) == EltIdx) { + assert(SrcVT == InputVector.getOperand(0).getValueType() && + "Vector type mismatch"); + SDValue Scl = InputVector.getOperand(1); + Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl); + return DAG.getZExtOrTrunc(Scl, dl, VT); + } + + // TODO - Remove this once we can handle the implicit zero-extension of + // X86ISD::PEXTRW/X86ISD::PEXTRB in XFormVExtractWithShuffleIntoLoad, + // combineHorizontalPredicateResult and combineBasicSADPattern. return SDValue(); } @@ -35832,6 +36530,15 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, // get simplified at node creation time)? bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); + + // If both inputs are 0/undef, create a complete zero vector. + // FIXME: As noted above this should be handled by DAGCombiner/getNode. + if (TValIsAllZeros && FValIsAllZeros) { + if (VT.isFloatingPoint()) + return DAG.getConstantFP(0.0, DL, VT); + return DAG.getConstant(0, DL, VT); + } + if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) { // Invert the cond to not(cond) : xor(op,allones)=not(op) @@ -36295,8 +37002,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Since SKX these selects have a proper lowering. 
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() && CondVT.getVectorElementType() == MVT::i1 && - (ExperimentalVectorWideningLegalization || - VT.getVectorNumElements() > 4) && (VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16)) { Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); @@ -36358,6 +37063,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // subl %esi, $edi // cmovsl %eax, %edi if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && + Cond.hasOneUse() && DAG.isEqualTo(LHS, Cond.getOperand(0)) && DAG.isEqualTo(RHS, Cond.getOperand(1))) { ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); @@ -36508,6 +37214,12 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, if (SDValue V = narrowVectorSelect(N, DAG, Subtarget)) return V; + // select(~Cond, X, Y) -> select(Cond, Y, X) + if (CondVT.getScalarType() != MVT::i1) + if (SDValue CondNot = IsNOT(Cond, DAG)) + return DAG.getNode(N->getOpcode(), DL, VT, + DAG.getBitcast(CondVT, CondNot), RHS, LHS); + // Custom action for SELECT MMX if (VT == MVT::x86mmx) { LHS = DAG.getBitcast(MVT::i64, LHS); @@ -36873,8 +37585,8 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, // We can't always do this as FCMOV only supports a subset of X86 cond. if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) { if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) { - SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8), - Flags}; + SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8), + Flags}; return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); } } @@ -36923,12 +37635,13 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, // Optimize cases that will turn into an LEA instruction. This requires // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { - uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); - if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; + APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue(); + assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() && + "Implicit constant truncation"); bool isFastMultiplier = false; - if (Diff < 10) { - switch ((unsigned char)Diff) { + if (Diff.ult(10)) { + switch (Diff.getZExtValue()) { default: break; case 1: // result = add base, cond case 2: // result = lea base( , cond*2) @@ -36943,7 +37656,6 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, } if (isFastMultiplier) { - APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); Cond = getSETCC(CC, Cond, DL ,DAG); // Zero extend the condition if needed. 
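[Editor's note] The "fast multiplier" cmov case in combineCMov above uses select-of-constants algebra: cond ? TrueC : FalseC equals FalseC + cond * (TrueC - FalseC), and when the difference is 1, 2, 3, 4, 5, 8 or 9 the multiply folds into an LEA addressing mode. A scalar model of the transform (names illustrative):

    #include <cstdint>

    // cond ? TrueC : FalseC as a flag materialization plus one LEA-friendly
    // multiply-add, instead of a conditional move.
    uint64_t selectConst(bool Cond, uint64_t TrueC, uint64_t FalseC) {
      return FalseC + (uint64_t)Cond * (TrueC - FalseC);
    }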
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), @@ -36994,8 +37706,8 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) { - SDValue Ops[] = { FalseOp, Cond.getOperand(0), - DAG.getConstant(CC, DL, MVT::i8), Cond }; + SDValue Ops[] = {FalseOp, Cond.getOperand(0), + DAG.getTargetConstant(CC, DL, MVT::i8), Cond}; return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); } } @@ -37029,10 +37741,11 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, CC1 = X86::GetOppositeBranchCondition(CC1); } - SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8), - Flags}; + SDValue LOps[] = {FalseOp, TrueOp, + DAG.getTargetConstant(CC0, DL, MVT::i8), Flags}; SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps); - SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags}; + SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8), + Flags}; SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); return CMOV; } @@ -37064,9 +37777,9 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); // This should constant fold. SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1)); - SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0), - DAG.getConstant(X86::COND_NE, DL, MVT::i8), - Cond); + SDValue CMov = + DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0), + DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond); return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1)); } } @@ -37166,98 +37879,45 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, if ((NumElts % 2) != 0) return SDValue(); - unsigned RegSize = 128; - MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16); EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts); // Shrink the operands of mul. SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0); SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1); - if (ExperimentalVectorWideningLegalization || - NumElts >= OpsVT.getVectorNumElements()) { - // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the - // lower part is needed. - SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); - if (Mode == MULU8 || Mode == MULS8) - return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, - DL, VT, MulLo); - - MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2); - // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, - // the higher part is also needed. - SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, - ReducedVT, NewN0, NewN1); - - // Repack the lower part and higher part result of mul into a wider - // result. - // Generate shuffle functioning as punpcklwd. - SmallVector<int, 16> ShuffleMask(NumElts); - for (unsigned i = 0, e = NumElts / 2; i < e; i++) { - ShuffleMask[2 * i] = i; - ShuffleMask[2 * i + 1] = i + NumElts; - } - SDValue ResLo = - DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); - ResLo = DAG.getBitcast(ResVT, ResLo); - // Generate shuffle functioning as punpckhwd. 
- for (unsigned i = 0, e = NumElts / 2; i < e; i++) { - ShuffleMask[2 * i] = i + NumElts / 2; - ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2; - } - SDValue ResHi = - DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); - ResHi = DAG.getBitcast(ResVT, ResHi); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); - } - - // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want - // to legalize the mul explicitly because implicit legalization for type - // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack - // instructions which will not exist when we explicitly legalize it by - // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with - // <4 x i16> undef). - // - // Legalize the operands of mul. - // FIXME: We may be able to handle non-concatenated vectors by insertion. - unsigned ReducedSizeInBits = ReducedVT.getSizeInBits(); - if ((RegSize % ReducedSizeInBits) != 0) - return SDValue(); - - SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits, - DAG.getUNDEF(ReducedVT)); - Ops[0] = NewN0; - NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); - Ops[0] = NewN1; - NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); - - if (Mode == MULU8 || Mode == MULS8) { - // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower - // part is needed. - SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); - - // convert the type of mul result to VT. - MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); - SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG - : ISD::SIGN_EXTEND_VECTOR_INREG, - DL, ResVT, Mul); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, - DAG.getIntPtrConstant(0, DL)); - } + // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the + // lower part is needed. + SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); + if (Mode == MULU8 || Mode == MULS8) + return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, + DL, VT, MulLo); - // Generate the lower and higher part of mul: pmulhw/pmulhuw. For - // MULU16/MULS16, both parts are needed. - SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); + MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2); + // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, + // the higher part is also needed. SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, - OpsVT, NewN0, NewN1); + ReducedVT, NewN0, NewN1); // Repack the lower part and higher part result of mul into a wider - // result. Make sure the type of mul result is VT. - MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); - SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi); - Res = DAG.getBitcast(ResVT, Res); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, - DAG.getIntPtrConstant(0, DL)); + // result. + // Generate shuffle functioning as punpcklwd. + SmallVector<int, 16> ShuffleMask(NumElts); + for (unsigned i = 0, e = NumElts / 2; i < e; i++) { + ShuffleMask[2 * i] = i; + ShuffleMask[2 * i + 1] = i + NumElts; + } + SDValue ResLo = + DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); + ResLo = DAG.getBitcast(ResVT, ResLo); + // Generate shuffle functioning as punpckhwd. 
+ for (unsigned i = 0, e = NumElts / 2; i < e; i++) { + ShuffleMask[2 * i] = i + NumElts / 2; + ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2; + } + SDValue ResHi = + DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); + ResHi = DAG.getBitcast(ResVT, ResHi); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); } static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, @@ -37365,8 +38025,7 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG, // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case. // Also allow v2i32 if it will be widened. MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements()); - if (!((ExperimentalVectorWideningLegalization && VT == MVT::v2i32) || - DAG.getTargetLoweringInfo().isTypeLegal(WVT))) + if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT)) return SDValue(); SDValue N0 = N->getOperand(0); @@ -37919,7 +38578,7 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, if (NewShiftVal >= NumBitsPerElt) NewShiftVal = NumBitsPerElt - 1; return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0), - DAG.getConstant(NewShiftVal, SDLoc(N), MVT::i8)); + DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8)); } // We can decode 'whole byte' logical bit shifts as shuffles. @@ -38039,7 +38698,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasAVX512()) { SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01, - DAG.getConstant(x86cc, DL, MVT::i8)); + DAG.getTargetConstant(x86cc, DL, MVT::i8)); // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1, @@ -38048,10 +38707,9 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL, N->getSimpleValueType(0)); } - SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, - CMP00.getValueType(), CMP00, CMP01, - DAG.getConstant(x86cc, DL, - MVT::i8)); + SDValue OnesOrZeroesF = + DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00, + CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8)); bool is64BitFP = (CMP00.getValueType() == MVT::f64); MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32; @@ -38083,34 +38741,6 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, return SDValue(); } -// Match (xor X, -1) -> X. -// Match extract_subvector(xor X, -1) -> extract_subvector(X). -// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y). 
-static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { - V = peekThroughBitcasts(V); - if (V.getOpcode() == ISD::XOR && - ISD::isBuildVectorAllOnes(V.getOperand(1).getNode())) - return V.getOperand(0); - if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && - (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) { - if (SDValue Not = IsNOT(V.getOperand(0), DAG)) { - Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(), - Not, V.getOperand(1)); - } - } - SmallVector<SDValue, 2> CatOps; - if (collectConcatOps(V.getNode(), CatOps)) { - for (SDValue &CatOp : CatOps) { - SDValue NotCat = IsNOT(CatOp, DAG); - if (!NotCat) return SDValue(); - CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat); - } - return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps); - } - return SDValue(); -} - /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y). static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::AND); @@ -38273,7 +38903,7 @@ static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); unsigned ShiftVal = SplatVal.countTrailingOnes(); - SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8); + SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8); SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt); return DAG.getBitcast(N->getValueType(0), Shift); } @@ -38499,7 +39129,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, // TODO: Support multiple SrcOps. if (VT == MVT::i1) { SmallVector<SDValue, 2> SrcOps; - if (matchBitOpReduction(SDValue(N, 0), ISD::AND, SrcOps) && + if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps) && SrcOps.size() == 1) { SDLoc dl(N); unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); @@ -38570,7 +39200,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, } if (SDValue Shuffle = combineX86ShufflesRecursively( - {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2, + {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle, N->getOperand(0).getOperand(1)); @@ -38585,7 +39215,7 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(N->getOpcode() == ISD::OR && "Unexpected Opcode"); - EVT VT = N->getValueType(0); + MVT VT = N->getSimpleValueType(0); if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0) return SDValue(); @@ -38594,10 +39224,12 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND) return SDValue(); - // On XOP we'll lower to PCMOV so accept one use, otherwise only - // do this if either mask has multiple uses already. - if (!(Subtarget.hasXOP() || !N0.getOperand(1).hasOneUse() || - !N1.getOperand(1).hasOneUse())) + // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use + // VPTERNLOG. Otherwise only do this if either mask has multiple uses already. + bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) || + Subtarget.hasVLX(); + if (!(Subtarget.hasXOP() || UseVPTERNLOG || + !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse())) return SDValue(); // Attempt to extract constant byte masks. 
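// Illustrative sketch, not from this patch (names are ours): the pattern
// canonicalizeBitSelect matches above is a per-bit select, (X & M) | (Y & ~M).
// With AVX512 it can lower to a single VPTERNLOG (imm8 0xCA when the mask is
// the first source), and with XOP to a PCMOV, which is why one use of each
// mask is now accepted on those targets.
#include <cstdint>

uint64_t bitSelect(uint64_t X, uint64_t Y, uint64_t M) {
  return (X & M) | (Y & ~M); // takes X's bit where M is set, Y's elsewhere
}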
@@ -38895,6 +39527,24 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, DAG.getBitcast(MVT::v4f32, N1))); } + // Match any-of bool scalar reductions into a bitcast/movmsk + cmp. + // TODO: Support multiple SrcOps. + if (VT == MVT::i1) { + SmallVector<SDValue, 2> SrcOps; + if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) && + SrcOps.size() == 1) { + SDLoc dl(N); + unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); + EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); + SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); + if (Mask) { + APInt AllBits = APInt::getNullValue(NumElts); + return DAG.getSetCC(dl, MVT::i1, Mask, + DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE); + } + } + } + if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -39136,26 +39786,6 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones); } -/// Check if truncation with saturation form type \p SrcVT to \p DstVT -/// is valid for the given \p Subtarget. -static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT, - const X86Subtarget &Subtarget) { - if (!Subtarget.hasAVX512()) - return false; - - // FIXME: Scalar type may be supported if we move it to vector register. - if (!SrcVT.isVector()) - return false; - - EVT SrcElVT = SrcVT.getScalarType(); - EVT DstElVT = DstVT.getScalarType(); - if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32) - return false; - if (SrcVT.is512BitVector() || Subtarget.hasVLX()) - return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI(); - return false; -} - /// Detect patterns of truncation with unsigned saturation: /// /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). @@ -39253,64 +39883,61 @@ static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) { return SDValue(); } -/// Detect a pattern of truncation with signed saturation. -/// The types should allow to use VPMOVSS* instruction on AVX512. -/// Return the source value to be truncated or SDValue() if the pattern was not -/// matched. -static SDValue detectAVX512SSatPattern(SDValue In, EVT VT, - const X86Subtarget &Subtarget, - const TargetLowering &TLI) { - if (!TLI.isTypeLegal(In.getValueType())) - return SDValue(); - if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) - return SDValue(); - return detectSSatPattern(In, VT); -} - -/// Detect a pattern of truncation with saturation: -/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). -/// The types should allow to use VPMOVUS* instruction on AVX512. -/// Return the source value to be truncated or SDValue() if the pattern was not -/// matched. 
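// Illustrative sketch, not from this patch (name is ours): the shape
// detectUSatPattern recognizes. Clamping with umin against the destination
// type's maximum before truncating is exactly a saturating truncate, which
// is what lets these combines emit VTRUNCUS / PACKUS nodes.
#include <algorithm>
#include <cstdint>

uint8_t usatTruncate(uint32_t X) {
  return uint8_t(std::min<uint32_t>(X, 255u)); // truncate(umin(x, 255))
}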
-static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
-                                       const SDLoc &DL,
-                                       const X86Subtarget &Subtarget,
-                                       const TargetLowering &TLI) {
-  if (!TLI.isTypeLegal(In.getValueType()))
-    return SDValue();
-  if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
-    return SDValue();
-  return detectUSatPattern(In, VT, DAG, DL);
-}
-
 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
                                       SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
-  EVT SVT = VT.getScalarType();
+  if (!Subtarget.hasSSE2() || !VT.isVector())
+    return SDValue();
+
+  EVT SVT = VT.getVectorElementType();
   EVT InVT = In.getValueType();
-  EVT InSVT = InVT.getScalarType();
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) &&
-      isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
-    if (auto SSatVal = detectSSatPattern(In, VT))
-      return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
-    if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
-      return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
+  EVT InSVT = InVT.getVectorElementType();
+
+  // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
+  // split across two registers, we can use a packusdw+perm to clamp to 0-65535
+  // and concatenate at the same time. Then we can use a final vpmovuswb to
+  // clip to 0-255.
+  if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
+      InVT == MVT::v16i32 && VT == MVT::v16i8) {
+    if (auto USatVal = detectSSatPattern(In, VT, true)) {
+      // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
+      SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
+                                           DL, DAG, Subtarget);
+      assert(Mid && "Failed to pack!");
+      return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
+    }
   }
-  if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
-      !Subtarget.hasAVX512() &&
+
+  // vXi32 truncate instructions are available with AVX512F.
+  // vXi16 truncate instructions are only available with AVX512BW.
+  // For 256-bit or smaller vectors, we require VLX.
+  // FIXME: We could widen truncates to 512 to remove the VLX restriction.
+  // If the result type is 256 bits or larger and we have disabled 512-bit
+  // registers, we should go ahead and use the pack instructions if possible.
+  bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
+                       (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
+                      (InVT.getSizeInBits() > 128) &&
+                      (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
+                      !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
+
+  if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
+      VT.getSizeInBits() >= 64 &&
      (SVT == MVT::i8 || SVT == MVT::i16) &&
      (InSVT == MVT::i16 || InSVT == MVT::i32)) {
    if (auto USatVal = detectSSatPattern(In, VT, true)) {
      // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
+      // Only do this when the result is at least 64 bits or we'll be leaving
+      // dangling PACKSSDW nodes.
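// Illustrative sketch, not from this patch (name is ours): scalar semantics
// of the PACKSSDW+PACKUSWB chain emitted just below. PACKSS saturates signed
// i32 to signed i16, and PACKUS then saturates signed i16 to unsigned i8, so
// an input already proven to lie in 0..255 survives both steps unchanged.
#include <algorithm>
#include <cstdint>

uint8_t packssThenPackus(int32_t V) {
  int16_t W = int16_t(std::clamp(V, -32768, 32767)); // PACKSSDW lane
  return uint8_t(std::clamp<int>(W, 0, 255));        // PACKUSWB lane
}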
if (SVT == MVT::i8 && InSVT == MVT::i32) { EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements()); SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL, DAG, Subtarget); - if (Mid) - return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG, - Subtarget); + assert(Mid && "Failed to pack!"); + SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG, + Subtarget); + assert(V && "Failed to pack!"); + return V; } else if (SVT == MVT::i8 || Subtarget.hasSSE41()) return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG, Subtarget); @@ -39319,6 +39946,42 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG, Subtarget); } + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 && + Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) { + unsigned TruncOpc; + SDValue SatVal; + if (auto SSatVal = detectSSatPattern(In, VT)) { + SatVal = SSatVal; + TruncOpc = X86ISD::VTRUNCS; + } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) { + SatVal = USatVal; + TruncOpc = X86ISD::VTRUNCUS; + } + if (SatVal) { + unsigned ResElts = VT.getVectorNumElements(); + // If the input type is less than 512 bits and we don't have VLX, we need + // to widen to 512 bits. + if (!Subtarget.hasVLX() && !InVT.is512BitVector()) { + unsigned NumConcats = 512 / InVT.getSizeInBits(); + ResElts *= NumConcats; + SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT)); + ConcatOps[0] = SatVal; + InVT = EVT::getVectorVT(*DAG.getContext(), InSVT, + NumConcats * InVT.getVectorNumElements()); + SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps); + } + // Widen the result if its narrower than 128 bits. + if (ResElts * SVT.getSizeInBits() < 128) + ResElts = 128 / SVT.getSizeInBits(); + EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts); + SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); + } + } + return SDValue(); } @@ -39377,7 +40040,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, return true; }; - // Check if each element of the vector is left-shifted by one. + // Check if each element of the vector is right-shifted by one. auto LHS = In.getOperand(0); auto RHS = In.getOperand(1); if (!IsConstVectorInRange(RHS, 1, 1)) @@ -39679,90 +40342,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, return Blend; } - if (Mld->getExtensionType() != ISD::EXTLOAD) - return SDValue(); - - // Resolve extending loads. - EVT VT = Mld->getValueType(0); - unsigned NumElems = VT.getVectorNumElements(); - EVT LdVT = Mld->getMemoryVT(); - SDLoc dl(Mld); - - assert(LdVT != VT && "Cannot extend to the same type"); - unsigned ToSz = VT.getScalarSizeInBits(); - unsigned FromSz = LdVT.getScalarSizeInBits(); - // From/To sizes and ElemCount must be pow of two. - assert (isPowerOf2_32(NumElems * FromSz * ToSz) && - "Unexpected size for extending masked load"); - - unsigned SizeRatio = ToSz / FromSz; - assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits()); - - // Create a type on which we perform the shuffle. - EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), - LdVT.getScalarType(), NumElems*SizeRatio); - assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - - // Convert PassThru value. 
- SDValue WidePassThru = DAG.getBitcast(WideVecVT, Mld->getPassThru()); - if (!Mld->getPassThru().isUndef()) { - SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i] = i * SizeRatio; - - // Can't shuffle using an illegal type. - assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && - "WideVecVT should be legal"); - WidePassThru = DAG.getVectorShuffle(WideVecVT, dl, WidePassThru, - DAG.getUNDEF(WideVecVT), ShuffleVec); - } - - // Prepare the new mask. - SDValue NewMask; - SDValue Mask = Mld->getMask(); - if (Mask.getValueType() == VT) { - // Mask and original value have the same type. - NewMask = DAG.getBitcast(WideVecVT, Mask); - SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i] = i * SizeRatio; - for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) - ShuffleVec[i] = NumElems * SizeRatio; - NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, - DAG.getConstant(0, dl, WideVecVT), - ShuffleVec); - } else { - assert(Mask.getValueType().getVectorElementType() == MVT::i1); - unsigned WidenNumElts = NumElems*SizeRatio; - unsigned MaskNumElts = VT.getVectorNumElements(); - EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - WidenNumElts); - - unsigned NumConcat = WidenNumElts / MaskNumElts; - SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); - SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal); - Ops[0] = Mask; - NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); - } - - SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), - Mld->getBasePtr(), NewMask, WidePassThru, - Mld->getMemoryVT(), Mld->getMemOperand(), - ISD::NON_EXTLOAD); - - SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd); - SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i * SizeRatio] = i; - - // Can't shuffle using an illegal type. - assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && - "WideVecVT should be legal"); - SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, - DAG.getUNDEF(WideVecVT), ShuffleVec); - SlicedVec = DAG.getBitcast(VT, SlicedVec); - - return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true); + return SDValue(); } /// If exactly one element of the mask is set for a non-truncating masked store, @@ -39800,123 +40380,45 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT VT = Mst->getValue().getValueType(); - EVT StVT = Mst->getMemoryVT(); SDLoc dl(Mst); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!Mst->isTruncatingStore()) { - if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG)) - return ScalarStore; - - // If the mask value has been legalized to a non-boolean vector, try to - // simplify ops leading up to it. We only demand the MSB of each lane. - SDValue Mask = Mst->getMask(); - if (Mask.getScalarValueSizeInBits() != 1) { - APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits())); - if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) - return SDValue(N, 0); - } - - // TODO: AVX512 targets should also be able to simplify something like the - // pattern above, but that pattern will be different. It will either need to - // match setcc more generally or match PCMPGTM later (in tablegen?). 
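// Illustrative sketch, not from this patch (name is ours): why these
// combines demand only APInt::getSignMask() per mask element. Once a boolean
// mask has been legalized to a wide vector, BLENDV/VMASKMOV-style
// instructions read nothing but the sign bit of each lane, so every lower
// bit of the mask is dead and SimplifyDemandedBits may rewrite whatever
// computes it.
#include <cstdint>

bool laneIsActive(int32_t MaskLane) {
  return MaskLane < 0; // only bit 31 of the lane participates
}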
- - SDValue Value = Mst->getValue(); - if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() && - TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), - Mst->getMemoryVT())) { - return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), - Mst->getBasePtr(), Mask, - Mst->getMemoryVT(), Mst->getMemOperand(), true); - } - - return SDValue(); - } - - // Resolve truncating stores. - unsigned NumElems = VT.getVectorNumElements(); - - assert(StVT != VT && "Cannot truncate to the same type"); - unsigned FromSz = VT.getScalarSizeInBits(); - unsigned ToSz = StVT.getScalarSizeInBits(); - - // The truncating store is legal in some cases. For example - // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw - // are designated for truncate store. - // In this case we don't need any further transformations. - if (TLI.isTruncStoreLegal(VT, StVT)) + if (Mst->isTruncatingStore()) return SDValue(); - // From/To sizes and ElemCount must be pow of two. - assert (isPowerOf2_32(NumElems * FromSz * ToSz) && - "Unexpected size for truncating masked store"); - // We are going to use the original vector elt for storing. - // Accumulated smaller vector elements must be a multiple of the store size. - assert (((NumElems * FromSz) % ToSz) == 0 && - "Unexpected ratio for truncating masked store"); - - unsigned SizeRatio = FromSz / ToSz; - assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); + if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG)) + return ScalarStore; - // Create a type on which we perform the shuffle. - EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), - StVT.getScalarType(), NumElems*SizeRatio); - - assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - - SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue()); - SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i] = i * SizeRatio; - - // Can't shuffle using an illegal type. - assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && - "WideVecVT should be legal"); - - SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, - DAG.getUNDEF(WideVecVT), - ShuffleVec); - - SDValue NewMask; + // If the mask value has been legalized to a non-boolean vector, try to + // simplify ops leading up to it. We only demand the MSB of each lane. SDValue Mask = Mst->getMask(); - if (Mask.getValueType() == VT) { - // Mask and original value have the same type. 
- NewMask = DAG.getBitcast(WideVecVT, Mask); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i] = i * SizeRatio; - for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) - ShuffleVec[i] = NumElems*SizeRatio; - NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, - DAG.getConstant(0, dl, WideVecVT), - ShuffleVec); - } else { - assert(Mask.getValueType().getVectorElementType() == MVT::i1); - unsigned WidenNumElts = NumElems*SizeRatio; - unsigned MaskNumElts = VT.getVectorNumElements(); - EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - WidenNumElts); + if (Mask.getScalarValueSizeInBits() != 1) { + APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits())); + if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) + return SDValue(N, 0); + } - unsigned NumConcat = WidenNumElts / MaskNumElts; - SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); - SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal); - Ops[0] = Mask; - NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); + SDValue Value = Mst->getValue(); + if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() && + TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), + Mst->getMemoryVT())) { + return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), + Mst->getBasePtr(), Mask, + Mst->getMemoryVT(), Mst->getMemOperand(), true); } - return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, - Mst->getBasePtr(), NewMask, StVT, - Mst->getMemOperand(), false); + return SDValue(); } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { StoreSDNode *St = cast<StoreSDNode>(N); - EVT VT = St->getValue().getValueType(); EVT StVT = St->getMemoryVT(); SDLoc dl(St); unsigned Alignment = St->getAlignment(); - SDValue StoredVal = St->getOperand(1); + SDValue StoredVal = St->getValue(); + EVT VT = StoredVal.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Convert a store of vXi1 into a store of iX and a bitcast. @@ -39986,8 +40488,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getMemOperand()->getFlags()); } - // If we are saving a concatenation of two XMM registers and 32-byte stores - // are slow, such as on Sandy Bridge, perform two 16-byte stores. + // If we are saving a 32-byte vector and 32-byte stores are slow, such as on + // Sandy Bridge, perform two 16-byte stores. bool Fast; if (VT.is256BitVector() && StVT == VT && TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, @@ -40026,13 +40528,24 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() && St->getValue().getOpcode() == ISD::TRUNCATE && St->getValue().getOperand(0).getValueType() == MVT::v16i16 && - TLI.isTruncStoreLegalOrCustom(MVT::v16i32, MVT::v16i8) && - !DCI.isBeforeLegalizeOps()) { + TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) && + St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) { SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue()); return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(), MVT::v16i8, St->getMemOperand()); } + // Try to fold a VTRUNCUS or VTRUNCS into a truncating store. 
+ if (!St->isTruncatingStore() && StoredVal.hasOneUse() && + (StoredVal.getOpcode() == X86ISD::VTRUNCUS || + StoredVal.getOpcode() == X86ISD::VTRUNCS) && + TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) { + bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS; + return EmitTruncSStore(IsSigned, St->getChain(), + dl, StoredVal.getOperand(0), St->getBasePtr(), + VT, St->getMemOperand(), DAG); + } + // Optimize trunc store (of multiple scalars) to shuffle and store. // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. @@ -40040,100 +40553,26 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, // Check if we can detect an AVG pattern from the truncation. If yes, // replace the trunc store by a normal store with the result of X86ISD::AVG // instruction. - if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, - Subtarget, dl)) - return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), - St->getMemOperand()->getFlags()); - - if (SDValue Val = - detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget, - TLI)) - return EmitTruncSStore(true /* Signed saturation */, St->getChain(), - dl, Val, St->getBasePtr(), - St->getMemoryVT(), St->getMemOperand(), DAG); - if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), - DAG, dl, Subtarget, TLI)) - return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), - dl, Val, St->getBasePtr(), - St->getMemoryVT(), St->getMemOperand(), DAG); - - unsigned NumElems = VT.getVectorNumElements(); - assert(StVT != VT && "Cannot truncate to the same type"); - unsigned FromSz = VT.getScalarSizeInBits(); - unsigned ToSz = StVT.getScalarSizeInBits(); - - // The truncating store is legal in some cases. For example - // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw - // are designated for truncate store. - // In this case we don't need any further transformations. - if (TLI.isTruncStoreLegalOrCustom(VT, StVT)) - return SDValue(); - - // From, To sizes and ElemCount must be pow of two - if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); - // We are going to use the original vector elt for storing. - // Accumulated smaller vector elements must be a multiple of the store size. - if (0 != (NumElems * FromSz) % ToSz) return SDValue(); - - unsigned SizeRatio = FromSz / ToSz; - - assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); - - // Create a type on which we perform the shuffle - EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), - StVT.getScalarType(), NumElems*SizeRatio); - - assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - - SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue()); - SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i] = i * SizeRatio; + if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT())) + if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, + Subtarget, dl)) + return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), + St->getPointerInfo(), St->getAlignment(), + St->getMemOperand()->getFlags()); - // Can't shuffle using an illegal type. - if (!TLI.isTypeLegal(WideVecVT)) - return SDValue(); - - SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, - DAG.getUNDEF(WideVecVT), - ShuffleVec); - // At this point all of the data is stored at the bottom of the - // register. We now need to save it to mem. 
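// Illustrative sketch, not from this patch (name is ours), of the removed
// lowering below: after shuffling the truncated elements to the bottom of
// the register, it scanned the legal integer types for the widest store
// unit no larger than the payload and emitted one store per unit.
#include <cstdint>

unsigned pickStoreUnitBits(unsigned PayloadBits) {
  unsigned Unit = 8;                  // start from the i8 fallback
  for (unsigned Tp : {16u, 32u, 64u}) // candidate legal scalar widths
    if (Tp <= PayloadBits)
      Unit = Tp;                      // keep the largest that still fits
  return Unit;
}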
- - // Find the largest store unit - MVT StoreType = MVT::i8; - for (MVT Tp : MVT::integer_valuetypes()) { - if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) - StoreType = Tp; - } - - // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. - if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && - (64 <= NumElems * ToSz)) - StoreType = MVT::f64; - - // Bitcast the original vector into a vector of store-size units - EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), - StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); - assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); - SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff); - SmallVector<SDValue, 8> Chains; - SDValue Ptr = St->getBasePtr(); - - // Perform one or more big stores into memory. - for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { - SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, - StoreType, ShuffWide, - DAG.getIntPtrConstant(i, dl)); - SDValue Ch = - DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(), - St->getAlignment(), St->getMemOperand()->getFlags()); - Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl); - Chains.push_back(Ch); + if (TLI.isTruncStoreLegal(VT, StVT)) { + if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT())) + return EmitTruncSStore(true /* Signed saturation */, St->getChain(), + dl, Val, St->getBasePtr(), + St->getMemoryVT(), St->getMemOperand(), DAG); + if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(), + DAG, dl)) + return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), + dl, Val, St->getBasePtr(), + St->getMemoryVT(), St->getMemOperand(), DAG); } - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + return SDValue(); } // Turn load->store of MMX types into GPR load/stores. This avoids clobbering @@ -40149,11 +40588,10 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); bool F64IsLegal = !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2(); - if (((VT.isVector() && !VT.isFloatingPoint()) || - (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) && + if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) && isa<LoadSDNode>(St->getValue()) && - !cast<LoadSDNode>(St->getValue())->isVolatile() && - St->getChain().hasOneUse() && !St->isVolatile()) { + cast<LoadSDNode>(St->getValue())->isSimple() && + St->getChain().hasOneUse() && St->isSimple()) { LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode()); SmallVector<SDValue, 8> Ops; @@ -40595,8 +41033,8 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - // Requires SSE2 but AVX512 has fast truncate. - if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) + // Requires SSE2. + if (!Subtarget.hasSSE2()) return SDValue(); if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple()) @@ -40620,6 +41058,13 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64) return SDValue(); + // AVX512 has fast truncate, but if the input is already going to be split, + // there's no harm in trying pack. 
+ if (Subtarget.hasAVX512() && + !(!Subtarget.useAVX512Regs() && VT.is256BitVector() && + InVT.is512BitVector())) + return SDValue(); + unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16); unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8; @@ -40658,9 +41103,7 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, // Only handle vXi16 types that are at least 128-bits unless they will be // widened. - if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 || - (!ExperimentalVectorWideningLegalization && - VT.getVectorNumElements() < 8)) + if (!VT.isVector() || VT.getVectorElementType() != MVT::i16) return SDValue(); // Input type should be vXi32. @@ -40874,6 +41317,19 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, return combineVectorTruncation(N, DAG, Subtarget); } +static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + SDValue In = N->getOperand(0); + SDLoc DL(N); + + if (auto SSatVal = detectSSatPattern(In, VT)) + return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal); + if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) + return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); + + return SDValue(); +} + /// Returns the negated value if the node \p N flips sign of FP value. /// /// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000) @@ -40883,10 +41339,14 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, /// In this case we go though all bitcasts. /// This also recognizes splat of a negated value and returns the splat of that /// value. -static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { +static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) { if (N->getOpcode() == ISD::FNEG) return N->getOperand(0); + // Don't recurse exponentially. + if (Depth > SelectionDAG::MaxRecursionDepth) + return SDValue(); + unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits(); SDValue Op = peekThroughBitcasts(SDValue(N, 0)); @@ -40900,7 +41360,7 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here. if (!SVOp->getOperand(1).isUndef()) return SDValue(); - if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode())) + if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode(), Depth + 1)) if (NegOp0.getValueType() == VT) // FIXME: Can we do better? 
return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT), SVOp->getMask()); @@ -40914,7 +41374,7 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { SDValue InsVal = Op.getOperand(1); if (!InsVector.isUndef()) return SDValue(); - if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode())) + if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1)) if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector, NegInsVal, Op.getOperand(2)); @@ -40951,6 +41411,57 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { return SDValue(); } +static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc, + bool NegRes) { + if (NegMul) { + switch (Opcode) { + default: llvm_unreachable("Unexpected opcode"); + case ISD::FMA: Opcode = X86ISD::FNMADD; break; + case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break; + case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break; + case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break; + case X86ISD::FNMADD: Opcode = ISD::FMA; break; + case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break; + case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break; + case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break; + } + } + + if (NegAcc) { + switch (Opcode) { + default: llvm_unreachable("Unexpected opcode"); + case ISD::FMA: Opcode = X86ISD::FMSUB; break; + case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break; + case X86ISD::FMSUB: Opcode = ISD::FMA; break; + case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break; + case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break; + case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; + case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break; + case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; + case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break; + case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break; + case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break; + case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break; + } + } + + if (NegRes) { + switch (Opcode) { + default: llvm_unreachable("Unexpected opcode"); + case ISD::FMA: Opcode = X86ISD::FNMSUB; break; + case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; + case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break; + case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; + case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break; + case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break; + case X86ISD::FNMSUB: Opcode = ISD::FMA; break; + case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break; + } + } + + return Opcode; +} + /// Do target-specific dag combines on floating point negations. static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -40980,29 +41491,123 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, // If we're negating an FMA node, then we can adjust the // instruction to include the extra negation. 
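// Illustrative sketch, not from this patch (name is ours): the algebra that
// negateFMAOpcode encodes. Because IEEE round-to-nearest is sign-symmetric,
// negating an FMA's result equals the fused operation with both the product
// and the accumulator negated, e.g. -(a*b + c) == (-a)*b + (-c), i.e.
// FMA -> FNMSUB, while negating only the accumulator gives FMA -> FMSUB.
#include <cassert>
#include <cmath>

void checkFmaNegation(double a, double b, double c) { // finite inputs
  assert(-std::fma(a, b, c) == std::fma(-a, b, -c));   // -(FMADD) == FNMSUB
  assert(std::fma(a, b, -c) == -std::fma(-a, b, c));   // FMSUB == -(FNMADD)
}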
- unsigned NewOpcode = 0; if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) { switch (Arg.getOpcode()) { - case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break; - case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break; - case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break; - case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break; - case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break; - case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break; - case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break; - case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break; - // We can't handle scalar intrinsic node here because it would only - // invert one element and not the whole vector. But we could try to handle - // a negation of the lower element only. + case ISD::FMA: + case X86ISD::FMSUB: + case X86ISD::FNMADD: + case X86ISD::FNMSUB: + case X86ISD::FMADD_RND: + case X86ISD::FMSUB_RND: + case X86ISD::FNMADD_RND: + case X86ISD::FNMSUB_RND: { + // We can't handle scalar intrinsic node here because it would only + // invert one element and not the whole vector. But we could try to handle + // a negation of the lower element only. + unsigned NewOpcode = negateFMAOpcode(Arg.getOpcode(), false, false, true); + return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, Arg->ops())); + } } } - if (NewOpcode) - return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, - Arg.getNode()->ops())); return SDValue(); } +char X86TargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, + bool ForCodeSize, + unsigned Depth) const { + // fneg patterns are removable even if they have multiple uses. + if (isFNEG(DAG, Op.getNode(), Depth)) + return 2; + + // Don't recurse exponentially. + if (Depth > SelectionDAG::MaxRecursionDepth) + return 0; + + EVT VT = Op.getValueType(); + EVT SVT = VT.getScalarType(); + switch (Op.getOpcode()) { + case ISD::FMA: + case X86ISD::FMSUB: + case X86ISD::FNMADD: + case X86ISD::FNMSUB: + case X86ISD::FMADD_RND: + case X86ISD::FMSUB_RND: + case X86ISD::FNMADD_RND: + case X86ISD::FNMSUB_RND: { + if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) || + !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations) + break; + + // This is always negatible for free but we might be able to remove some + // extra operand negations as well. + for (int i = 0; i != 3; ++i) { + char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations, + ForCodeSize, Depth + 1); + if (V == 2) + return V; + } + return 1; + } + } + + return TargetLowering::isNegatibleForFree(Op, DAG, LegalOperations, + ForCodeSize, Depth); +} + +SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, + bool ForCodeSize, + unsigned Depth) const { + // fneg patterns are removable even if they have multiple uses. + if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) + return DAG.getBitcast(Op.getValueType(), Arg); + + EVT VT = Op.getValueType(); + EVT SVT = VT.getScalarType(); + unsigned Opc = Op.getOpcode(); + switch (Opc) { + case ISD::FMA: + case X86ISD::FMSUB: + case X86ISD::FNMADD: + case X86ISD::FNMSUB: + case X86ISD::FMADD_RND: + case X86ISD::FMSUB_RND: + case X86ISD::FNMADD_RND: + case X86ISD::FNMSUB_RND: { + if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) || + !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations) + break; + + // This is always negatible for free but we might be able to remove some + // extra operand negations as well. 
+ SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue()); + for (int i = 0; i != 3; ++i) { + char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations, + ForCodeSize, Depth + 1); + if (V == 2) + NewOps[i] = getNegatedExpression(Op.getOperand(i), DAG, LegalOperations, + ForCodeSize, Depth + 1); + } + + bool NegA = !!NewOps[0]; + bool NegB = !!NewOps[1]; + bool NegC = !!NewOps[2]; + unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true); + + // Fill in the non-negated ops with the original values. + for (int i = 0, e = Op.getNumOperands(); i != e; ++i) + if (!NewOps[i]) + NewOps[i] = Op.getOperand(i); + return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps); + } + } + + return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations, + ForCodeSize, Depth); +} + static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = N->getSimpleValueType(0); @@ -41312,8 +41917,8 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { assert(InVT.is128BitVector() && "Expected 128-bit input vector"); LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); - // Unless the load is volatile. - if (!LN->isVolatile()) { + // Unless the load is volatile or atomic. + if (LN->isSimple()) { SDLoc dl(N); unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); MVT MemVT = MVT::getIntegerVT(NumBits); @@ -41347,8 +41952,8 @@ static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { assert(InVT.is128BitVector() && "Expected 128-bit input vector"); LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); - // Unless the load is volatile. - if (!LN->isVolatile()) { + // Unless the load is volatile or atomic. + if (LN->isSimple()) { SDLoc dl(N); unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); MVT MemVT = MVT::getFloatingPointVT(NumBits); @@ -41724,127 +42329,6 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG, DAG.getConstant(EltSizeInBits - 1, DL, VT)); } -/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or -/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating -/// with UNDEFs) of the input to vectors of the same size as the target type -/// which then extends the lowest elements. -static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { - if (ExperimentalVectorWideningLegalization) - return SDValue(); - - unsigned Opcode = N->getOpcode(); - // TODO - add ANY_EXTEND support. - if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND) - return SDValue(); - if (!DCI.isBeforeLegalizeOps()) - return SDValue(); - if (!Subtarget.hasSSE2()) - return SDValue(); - - SDValue N0 = N->getOperand(0); - EVT VT = N->getValueType(0); - EVT SVT = VT.getScalarType(); - EVT InVT = N0.getValueType(); - EVT InSVT = InVT.getScalarType(); - - // FIXME: Generic DAGCombiner previously had a bug that would cause a - // sign_extend of setcc to sometimes return the original node and tricked it - // into thinking CombineTo was used which prevented the target combines from - // running. 
- // Earlying out here to avoid regressions like this - // (v4i32 (sext (v4i1 (setcc (v4i16))))) - // Becomes - // (v4i32 (sext_invec (v8i16 (concat (v4i16 (setcc (v4i16))), undef)))) - // Type legalized to - // (v4i32 (sext_invec (v8i16 (trunc_invec (v4i32 (setcc (v4i32))))))) - // Leading to a packssdw+pmovsxwd - // We could write a DAG combine to fix this, but really we shouldn't be - // creating sext_invec that's forcing v8i16 into the DAG. - if (N0.getOpcode() == ISD::SETCC) - return SDValue(); - - // Input type must be a vector and we must be extending legal integer types. - if (!VT.isVector() || VT.getVectorNumElements() < 2) - return SDValue(); - if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) - return SDValue(); - if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) - return SDValue(); - - // If the input/output types are both legal then we have at least AVX1 and - // we will be able to use SIGN_EXTEND/ZERO_EXTEND directly. - if (DAG.getTargetLoweringInfo().isTypeLegal(VT) && - DAG.getTargetLoweringInfo().isTypeLegal(InVT)) - return SDValue(); - - SDLoc DL(N); - - auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) { - EVT SrcVT = N.getValueType(); - EVT DstVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(), - Size / SrcVT.getScalarSizeInBits()); - SmallVector<SDValue, 8> Opnds(Size / SrcVT.getSizeInBits(), - DAG.getUNDEF(SrcVT)); - Opnds[0] = N; - return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Opnds); - }; - - // If target-size is less than 128-bits, extend to a type that would extend - // to 128 bits, extend that and extract the original target vector. - if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) { - unsigned Scale = 128 / VT.getSizeInBits(); - EVT ExVT = - EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits()); - SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits()); - SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt, - DAG.getIntPtrConstant(0, DL)); - } - - // If target-size is 128-bits (or 256-bits on AVX target), then convert to - // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT. - // Also use this if we don't have SSE41 to allow the legalizer do its job. - if (!Subtarget.hasSSE41() || VT.is128BitVector() || - (VT.is256BitVector() && Subtarget.hasAVX()) || - (VT.is512BitVector() && Subtarget.useAVX512Regs())) { - SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); - Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode); - return DAG.getNode(Opcode, DL, VT, ExOp); - } - - auto SplitAndExtendInReg = [&](unsigned SplitSize) { - unsigned NumVecs = VT.getSizeInBits() / SplitSize; - unsigned NumSubElts = SplitSize / SVT.getSizeInBits(); - EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); - EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); - - unsigned IROpc = getOpcode_EXTEND_VECTOR_INREG(Opcode); - SmallVector<SDValue, 8> Opnds; - for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { - SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, - DAG.getIntPtrConstant(Offset, DL)); - SrcVec = ExtendVecSize(DL, SrcVec, SplitSize); - SrcVec = DAG.getNode(IROpc, DL, SubVT, SrcVec); - Opnds.push_back(SrcVec); - } - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); - }; - - // On pre-AVX targets, split into 128-bit nodes of - // ISD::*_EXTEND_VECTOR_INREG. 
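// Illustrative sketch, not from this patch (name is ours): what the
// SIGN_EXTEND_VECTOR_INREG nodes used by this removed combine compute. Only
// the low half of the source lanes is widened, exactly like PMOVSXWD:
#include <cstdint>

void sextVectorInReg(const int16_t Src[8], int32_t Dst[4]) {
  for (int i = 0; i != 4; ++i)
    Dst[i] = Src[i]; // sign-extend the 4 low i16 lanes to i32
}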
- if (!Subtarget.hasAVX() && !(VT.getSizeInBits() % 128)) - return SplitAndExtendInReg(128); - - // On pre-AVX512 targets, split into 256-bit nodes of - // ISD::*_EXTEND_VECTOR_INREG. - if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256)) - return SplitAndExtendInReg(256); - - return SDValue(); -} - // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm // result type. static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, @@ -41915,9 +42399,6 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT)); } - if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) - return V; - if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; @@ -41931,45 +42412,15 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) { - if (NegMul) { - switch (Opcode) { - default: llvm_unreachable("Unexpected opcode"); - case ISD::FMA: Opcode = X86ISD::FNMADD; break; - case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break; - case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break; - case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break; - case X86ISD::FNMADD: Opcode = ISD::FMA; break; - case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break; - case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break; - case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break; - } - } - - if (NegAcc) { - switch (Opcode) { - default: llvm_unreachable("Unexpected opcode"); - case ISD::FMA: Opcode = X86ISD::FMSUB; break; - case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break; - case X86ISD::FMSUB: Opcode = ISD::FMA; break; - case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break; - case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break; - case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; - case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break; - case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; - } - } - - return Opcode; -} - static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); // Let legalize expand this if it isn't a legal type yet. - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(VT)) return SDValue(); EVT ScalarVT = VT.getScalarType(); @@ -41980,17 +42431,21 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, SDValue B = N->getOperand(1); SDValue C = N->getOperand(2); - auto invertIfNegative = [&DAG](SDValue &V) { - if (SDValue NegVal = isFNEG(DAG, V.getNode())) { - V = DAG.getBitcast(V.getValueType(), NegVal); + auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) { + bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool LegalOperations = !DCI.isBeforeLegalizeOps(); + if (TLI.isNegatibleForFree(V, DAG, LegalOperations, CodeSize) == 2) { + V = TLI.getNegatedExpression(V, DAG, LegalOperations, CodeSize); return true; } // Look through extract_vector_elts. If it comes from an FNEG, create a // new extract from the FNEG input. 
if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT && isNullConstant(V.getOperand(1))) { - if (SDValue NegVal = isFNEG(DAG, V.getOperand(0).getNode())) { - NegVal = DAG.getBitcast(V.getOperand(0).getValueType(), NegVal); + SDValue Vec = V.getOperand(0); + if (TLI.isNegatibleForFree(Vec, DAG, LegalOperations, CodeSize) == 2) { + SDValue NegVal = + TLI.getNegatedExpression(Vec, DAG, LegalOperations, CodeSize); V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(), NegVal, V.getOperand(1)); return true; @@ -42009,7 +42464,8 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, if (!NegA && !NegB && !NegC) return SDValue(); - unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC); + unsigned NewOpcode = + negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false); if (N->getNumOperands() == 4) return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3)); @@ -42017,33 +42473,27 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, } // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C) +// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C) static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { + TargetLowering::DAGCombinerInfo &DCI) { SDLoc dl(N); EVT VT = N->getValueType(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool LegalOperations = !DCI.isBeforeLegalizeOps(); - SDValue NegVal = isFNEG(DAG, N->getOperand(2).getNode()); - if (!NegVal) - return SDValue(); - - // FIXME: Should we bitcast instead? - if (NegVal.getValueType() != VT) + SDValue N2 = N->getOperand(2); + if (TLI.isNegatibleForFree(N2, DAG, LegalOperations, CodeSize) != 2) return SDValue(); - unsigned NewOpcode; - switch (N->getOpcode()) { - default: llvm_unreachable("Unexpected opcode!"); - case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break; - case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break; - case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break; - case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break; - } + SDValue NegN2 = TLI.getNegatedExpression(N2, DAG, LegalOperations, CodeSize); + unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false); if (N->getNumOperands() == 4) return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1), - NegVal, N->getOperand(3)); + NegN2, N->getOperand(3)); return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1), - NegVal); + NegN2); } static SDValue combineZext(SDNode *N, SelectionDAG &DAG, @@ -42090,9 +42540,6 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineExtSetcc(N, DAG, Subtarget)) return V; - if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) - return V; - if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; @@ -42111,12 +42558,11 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) { SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); - unsigned NumSrcElts = N00.getValueType().getVectorNumElements(); unsigned NumSrcEltBits = N00.getScalarValueSizeInBits(); APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2); if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) && (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) { - return concatSubVectors(N00, N01, VT, NumSrcElts * 2, DAG, dl, 128); + return concatSubVectors(N00, N01, DAG, 
dl); } } @@ -42159,16 +42605,30 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, !IsOrXorXorCCZero) return SDValue(); - // TODO: Use PXOR + PTEST for SSE4.1 or later? EVT VT = SetCC->getValueType(0); SDLoc DL(SetCC); + bool HasAVX = Subtarget.hasAVX(); + + // Use XOR (plus OR) and PTEST after SSE4.1 and before AVX512. + // Otherwise use PCMPEQ (plus AND) and mask testing. if ((OpSize == 128 && Subtarget.hasSSE2()) || - (OpSize == 256 && Subtarget.hasAVX2()) || + (OpSize == 256 && HasAVX) || (OpSize == 512 && Subtarget.useAVX512Regs())) { - EVT VecVT = OpSize == 512 ? MVT::v16i32 : - OpSize == 256 ? MVT::v32i8 : - MVT::v16i8; - EVT CmpVT = OpSize == 512 ? MVT::v16i1 : VecVT; + bool HasPT = Subtarget.hasSSE41(); + EVT VecVT = MVT::v16i8; + EVT CmpVT = MVT::v16i8; + if (OpSize == 256) + VecVT = CmpVT = MVT::v32i8; + if (OpSize == 512) { + if (Subtarget.hasBWI()) { + VecVT = MVT::v64i8; + CmpVT = MVT::v64i1; + } else { + VecVT = MVT::v16i32; + CmpVT = MVT::v16i1; + } + } + SDValue Cmp; if (IsOrXorXorCCZero) { // This is a bitwise-combined equality comparison of 2 pairs of vectors: @@ -42179,18 +42639,38 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1)); SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0)); SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1)); - SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ); - SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ); - Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2); + if (VecVT == CmpVT && HasPT) { + SDValue Cmp1 = DAG.getNode(ISD::XOR, DL, VecVT, A, B); + SDValue Cmp2 = DAG.getNode(ISD::XOR, DL, VecVT, C, D); + Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp1, Cmp2); + } else { + SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ); + SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ); + Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2); + } } else { SDValue VecX = DAG.getBitcast(VecVT, X); SDValue VecY = DAG.getBitcast(VecVT, Y); - Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ); + if (VecVT == CmpVT && HasPT) { + Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY); + } else { + Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ); + } } // For 512-bits we want to emit a setcc that will lower to kortest. - if (OpSize == 512) - return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i16, Cmp), - DAG.getConstant(0xFFFF, DL, MVT::i16), CC); + if (VecVT != CmpVT) { + EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 : MVT::i16; + SDValue Mask = DAG.getAllOnesConstant(DL, KRegVT); + return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp), Mask, CC); + } + if (HasPT) { + SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, + Cmp); + SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp); + X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE; + SDValue SetCC = getSETCC(X86CC, PT, DL, DAG); + return DAG.getNode(ISD::TRUNCATE, DL, VT, SetCC.getValue(0)); + } // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne @@ -42270,8 +42750,6 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, // go through type promotion to a 128-bit vector. 
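// Illustrative sketch, not from this patch (name is ours): the SSE4.1 path
// added in combineVectorSizedSetCCEquality above. A 128-bit X == Y becomes
// PTEST(X ^ Y, X ^ Y), whose ZF flag is set iff the XOR is all zero; the
// pre-SSE4.1 path instead compares the PCMPEQB+PMOVMSKB mask with 0xFFFF.
#include <cstdint>
#include <cstring>

bool equal128(const void *X, const void *Y) {
  uint64_t A[2], B[2];
  std::memcpy(A, X, 16);
  std::memcpy(B, Y, 16);
  return ((A[0] ^ B[0]) | (A[1] ^ B[1])) == 0; // models PTEST's ZF result
}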
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() && VT.getVectorElementType() == MVT::i1 && - (ExperimentalVectorWideningLegalization || - VT.getVectorNumElements() > 4) && (OpVT.getVectorElementType() == MVT::i8 || OpVT.getVectorElementType() == MVT::i16)) { SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS, @@ -42289,7 +42767,8 @@ } static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDValue Src = N->getOperand(0); MVT SrcVT = Src.getSimpleValueType(); MVT VT = N->getSimpleValueType(0); @@ -42310,7 +42789,7 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, // Look through int->fp bitcasts that don't change the element width. unsigned EltWidth = SrcVT.getScalarSizeInBits(); - if (Src.getOpcode() == ISD::BITCAST && + if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST && Src.getOperand(0).getScalarValueSizeInBits() == EltWidth) return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0)); @@ -42334,71 +42813,123 @@ return SDValue(); } +static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + // With vector masks we only demand the upper bit of the mask. + SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask(); + if (Mask.getScalarValueSizeInBits() != 1) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); + if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) + return SDValue(N, 0); + } + + return SDValue(); +} + static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { + TargetLowering::DAGCombinerInfo &DCI) { SDLoc DL(N); + auto *GorS = cast<MaskedGatherScatterSDNode>(N); + SDValue Chain = GorS->getChain(); + SDValue Index = GorS->getIndex(); + SDValue Mask = GorS->getMask(); + SDValue Base = GorS->getBasePtr(); + SDValue Scale = GorS->getScale(); - if (DCI.isBeforeLegalizeOps()) { - SDValue Index = N->getOperand(4); - // Remove any sign extends from 32 or smaller to larger than 32. - // Only do this before LegalizeOps in case we need the sign extend for - // legalization. - if (Index.getOpcode() == ISD::SIGN_EXTEND) { - if (Index.getScalarValueSizeInBits() > 32 && - Index.getOperand(0).getScalarValueSizeInBits() <= 32) { - SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); - NewOps[4] = Index.getOperand(0); - SDNode *Res = DAG.UpdateNodeOperands(N, NewOps); - if (Res == N) { - // The original sign extend has less users, add back to worklist in - // case it needs to be removed - DCI.AddToWorklist(Index.getNode()); - DCI.AddToWorklist(N); + if (DCI.isBeforeLegalize()) { + unsigned IndexWidth = Index.getScalarValueSizeInBits(); + + // Shrink constant indices if they are larger than 32-bits. + // Only do this before legalize types since v2i64 could become v2i32. + // FIXME: We could check that the type is legal if we're after legalize + // types, but then we would need to construct test cases where that happens. + // FIXME: We could support more than just constant vectors, but we need to be + // careful with costing. A truncate that can be optimized out would be fine. + // Otherwise we might only want to create a truncate if it avoids a split. 
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) { + if (BV->isConstant() && IndexWidth > 32 && + DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) { + unsigned NumElts = Index.getValueType().getVectorNumElements(); + EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index); + if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) { + SDValue Ops[] = { Chain, Gather->getPassThru(), + Mask, Base, Index, Scale } ; + return DAG.getMaskedGather(Gather->getVTList(), + Gather->getMemoryVT(), DL, Ops, + Gather->getMemOperand(), + Gather->getIndexType()); } - return SDValue(Res, 0); + auto *Scatter = cast<MaskedScatterSDNode>(GorS); + SDValue Ops[] = { Chain, Scatter->getValue(), + Mask, Base, Index, Scale }; + return DAG.getMaskedScatter(Scatter->getVTList(), + Scatter->getMemoryVT(), DL, + Ops, Scatter->getMemOperand(), + Scatter->getIndexType()); + } + } + + // Shrink any sign/zero extends from 32 or smaller to larger than 32 if + // there are sufficient sign bits. Only do this before legalize types to + // avoid creating illegal types in truncate. + if ((Index.getOpcode() == ISD::SIGN_EXTEND || + Index.getOpcode() == ISD::ZERO_EXTEND) && + IndexWidth > 32 && + Index.getOperand(0).getScalarValueSizeInBits() <= 32 && + DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) { + unsigned NumElts = Index.getValueType().getVectorNumElements(); + EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index); + if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) { + SDValue Ops[] = { Chain, Gather->getPassThru(), + Mask, Base, Index, Scale } ; + return DAG.getMaskedGather(Gather->getVTList(), + Gather->getMemoryVT(), DL, Ops, + Gather->getMemOperand(), + Gather->getIndexType()); } + auto *Scatter = cast<MaskedScatterSDNode>(GorS); + SDValue Ops[] = { Chain, Scatter->getValue(), + Mask, Base, Index, Scale }; + return DAG.getMaskedScatter(Scatter->getVTList(), + Scatter->getMemoryVT(), DL, + Ops, Scatter->getMemOperand(), + Scatter->getIndexType()); } + } + + if (DCI.isBeforeLegalizeOps()) { + unsigned IndexWidth = Index.getScalarValueSizeInBits(); // Make sure the index is either i32 or i64 - unsigned ScalarSize = Index.getScalarValueSizeInBits(); - if (ScalarSize != 32 && ScalarSize != 64) { - MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32; + if (IndexWidth != 32 && IndexWidth != 64) { + MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32; EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT, Index.getValueType().getVectorNumElements()); Index = DAG.getSExtOrTrunc(Index, DL, IndexVT); - SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); - NewOps[4] = Index; - SDNode *Res = DAG.UpdateNodeOperands(N, NewOps); - if (Res == N) - DCI.AddToWorklist(N); - return SDValue(Res, 0); - } - - // Try to remove zero extends from 32->64 if we know the sign bit of - // the input is zero. 
- if (Index.getOpcode() == ISD::ZERO_EXTEND && - Index.getScalarValueSizeInBits() == 64 && - Index.getOperand(0).getScalarValueSizeInBits() == 32) { - if (DAG.SignBitIsZero(Index.getOperand(0))) { - SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); - NewOps[4] = Index.getOperand(0); - SDNode *Res = DAG.UpdateNodeOperands(N, NewOps); - if (Res == N) { - // The original sign extend has less users, add back to worklist in - // case it needs to be removed - DCI.AddToWorklist(Index.getNode()); - DCI.AddToWorklist(N); - } - return SDValue(Res, 0); + if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) { + SDValue Ops[] = { Chain, Gather->getPassThru(), + Mask, Base, Index, Scale } ; + return DAG.getMaskedGather(Gather->getVTList(), + Gather->getMemoryVT(), DL, Ops, + Gather->getMemOperand(), + Gather->getIndexType()); } + auto *Scatter = cast<MaskedScatterSDNode>(GorS); + SDValue Ops[] = { Chain, Scatter->getValue(), + Mask, Base, Index, Scale }; + return DAG.getMaskedScatter(Scatter->getVTList(), + Scatter->getMemoryVT(), DL, + Ops, Scatter->getMemOperand(), + Scatter->getIndexType()); } } - // With AVX2 we only demand the upper bit of the mask. - if (!Subtarget.hasAVX512()) { + // With vector masks we only demand the upper bit of the mask. + if (Mask.getScalarValueSizeInBits() != 1) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue Mask = N->getOperand(2); APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) return SDValue(N, 0); @@ -42432,7 +42963,7 @@ static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, // Make sure to not keep references to operands, as combineSetCCEFLAGS can // RAUW them under us. if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) { - SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); + SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8); return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0), N->getOperand(1), Cond, Flags); } @@ -42549,6 +43080,7 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, } static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { // First try to optimize away the conversion entirely when it's // conditionally from a constant. Vectors only. @@ -42578,13 +43110,22 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, unsigned BitWidth = InVT.getScalarSizeInBits(); unsigned NumSignBits = DAG.ComputeNumSignBits(Op0); if (NumSignBits >= (BitWidth - 31)) { - EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32); + EVT TruncVT = MVT::i32; if (InVT.isVector()) TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, InVT.getVectorNumElements()); SDLoc dl(N); - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0); - return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc); + if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) { + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0); + return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc); + } + // If we're after legalize and the type is v2i32 we need to shuffle and + // use CVTSI2P. 
+ assert(InVT == MVT::v2i64 && "Unexpected VT!"); + SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0); + SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast, + { 0, 2, -1, -1 }); + return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf); } } @@ -42604,7 +43145,7 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasDQI() && VT != MVT::f80) return SDValue(); - if (!Ld->isVolatile() && !VT.isVector() && + if (Ld->isSimple() && !VT.isVector() && ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && !Subtarget.is64Bit() && LdVT == MVT::i64) { SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD( @@ -42841,12 +43382,12 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); EVT VT = N->getValueType(0); SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1)); - SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, - DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getConstant(X86::COND_B, DL, - MVT::i8), - N->getOperand(2)), - DAG.getConstant(1, DL, VT)); + SDValue Res1 = + DAG.getNode(ISD::AND, DL, VT, + DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), + N->getOperand(2)), + DAG.getConstant(1, DL, VT)); return DCI.CombineTo(N, Res1, CarryOut); } @@ -42906,7 +43447,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getConstant(X86::COND_B, DL, MVT::i8), + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Y.getOperand(1)); } @@ -42924,7 +43465,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { EFLAGS.getOperand(1), EFLAGS.getOperand(0)); SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getConstant(X86::COND_B, DL, MVT::i8), + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), NewEFLAGS); } } @@ -42984,7 +43525,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getConstant(X86::COND_B, DL, MVT::i8), + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), SDValue(Neg.getNode(), 1)); } @@ -42997,7 +43538,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { SDValue One = DAG.getConstant(1, DL, ZVT); SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1); + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cmp1); } } @@ -43025,9 +43566,6 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, if (!Subtarget.hasSSE2()) return SDValue(); - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - EVT VT = N->getValueType(0); // If the vector size is less than 128, or greater than the supported RegSize, @@ -43035,14 +43573,27 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, if (!VT.isVector() || VT.getVectorNumElements() < 8) return SDValue(); - if (Op0.getOpcode() != ISD::MUL) - std::swap(Op0, Op1); - if (Op0.getOpcode() != ISD::MUL) - return SDValue(); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); - ShrinkMode Mode; - if (!canReduceVMulWidth(Op0.getNode(), DAG, Mode) || Mode == MULU16) - return SDValue(); + auto UsePMADDWD = 
[&](SDValue Op) { + ShrinkMode Mode; + return Op.getOpcode() == ISD::MUL && + canReduceVMulWidth(Op.getNode(), DAG, Mode) && Mode != MULU16 && + (!Subtarget.hasSSE41() || + (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && + Op->isOnlyUserOf(Op.getOperand(1).getNode()))); + }; + + SDValue MulOp, OtherOp; + if (UsePMADDWD(Op0)) { + MulOp = Op0; + OtherOp = Op1; + } else if (UsePMADDWD(Op1)) { + MulOp = Op1; + OtherOp = Op0; + } else + return SDValue(); SDLoc DL(N); EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, @@ -43050,34 +43601,27 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, VT.getVectorNumElements() / 2); + // Shrink the operands of mul. + SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0)); + SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1)); + // Madd vector size is half of the original vector size auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops) { MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops); }; + SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 }, + PMADDWDBuilder); + // Fill the rest of the output with 0 + SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType()); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero); - auto BuildPMADDWD = [&](SDValue Mul) { - // Shrink the operands of mul. - SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(0)); - SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(1)); - - SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 }, - PMADDWDBuilder); - // Fill the rest of the output with 0 - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, - DAG.getConstant(0, DL, MAddVT)); - }; - - Op0 = BuildPMADDWD(Op0); - - // It's possible that Op1 is also a mul we can reduce. - if (Op1.getOpcode() == ISD::MUL && - canReduceVMulWidth(Op1.getNode(), DAG, Mode) && Mode != MULU16) { - Op1 = BuildPMADDWD(Op1); - } - - return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1); + // Preserve the reduction flag on the ADD. We may need to revisit for the + // other operand. + SDNodeFlags Flags; + Flags.setVectorReduction(true); + return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags); } static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, @@ -43087,8 +43631,6 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); EVT VT = N->getValueType(0); - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); // TODO: There's nothing special about i32, any integer type above i16 should // work just as well. @@ -43108,80 +43650,53 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, if (VT.getSizeInBits() / 4 > RegSize) return SDValue(); - // We know N is a reduction add, which means one of its operands is a phi. - // To match SAD, we need the other operand to be a ABS. - if (Op0.getOpcode() != ISD::ABS) - std::swap(Op0, Op1); - if (Op0.getOpcode() != ISD::ABS) + // We know N is a reduction add. To match SAD, we need one of the operands to + // be an ABS. + SDValue AbsOp = N->getOperand(0); + SDValue OtherOp = N->getOperand(1); + if (AbsOp.getOpcode() != ISD::ABS) + std::swap(AbsOp, OtherOp); + if (AbsOp.getOpcode() != ISD::ABS) return SDValue(); - auto BuildPSADBW = [&](SDValue Op0, SDValue Op1) { - // SAD pattern detected. 
Now build a SAD instruction and an addition for - // reduction. Note that the number of elements of the result of SAD is less - // than the number of elements of its input. Therefore, we could only update - // part of elements in the reduction vector. - SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget); - - // The output of PSADBW is a vector of i64. - // We need to turn the vector of i64 into a vector of i32. - // If the reduction vector is at least as wide as the psadbw result, just - // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero - // anyway. - MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32); - if (VT.getSizeInBits() >= ResVT.getSizeInBits()) - Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); - else - Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad); - - if (VT.getSizeInBits() > ResVT.getSizeInBits()) { - // Fill the upper elements with zero to match the add width. - SDValue Zero = DAG.getConstant(0, DL, VT); - Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad, - DAG.getIntPtrConstant(0, DL)); - } - - return Sad; - }; - // Check whether we have an abs-diff pattern feeding into the select. SDValue SadOp0, SadOp1; - if (!detectZextAbsDiff(Op0, SadOp0, SadOp1)) + if (!detectZextAbsDiff(AbsOp, SadOp0, SadOp1)) return SDValue(); - Op0 = BuildPSADBW(SadOp0, SadOp1); - - // It's possible we have a sad on the other side too. - if (Op1.getOpcode() == ISD::ABS && - detectZextAbsDiff(Op1, SadOp0, SadOp1)) { - Op1 = BuildPSADBW(SadOp0, SadOp1); - } - - return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1); -} - -/// Convert vector increment or decrement to sub/add with an all-ones constant: -/// add X, <1, 1...> --> sub X, <-1, -1...> -/// sub X, <1, 1...> --> add X, <-1, -1...> -/// The all-ones vector constant can be materialized using a pcmpeq instruction -/// that is commonly recognized as an idiom (has no register dependency), so -/// that's better/smaller than loading a splat 1 constant. -static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) { - assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && - "Unexpected opcode for increment/decrement transform"); + // SAD pattern detected. Now build a SAD instruction and an addition for + // reduction. Note that the number of elements of the result of SAD is less + // than the number of elements of its input. Therefore, we could only update + // part of elements in the reduction vector. + SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget); - // Pseudo-legality check: getOnesVector() expects one of these types, so bail - // out and wait for legalization if we have an unsupported vector length. - EVT VT = N->getValueType(0); - if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector()) - return SDValue(); + // The output of PSADBW is a vector of i64. + // We need to turn the vector of i64 into a vector of i32. + // If the reduction vector is at least as wide as the psadbw result, just + // bitcast. If it's narrower, which can only occur for v2i32, bits 127:16 of + // the PSADBW will be zero. If we promote narrow vectors, truncate the v2i64 + // result to v2i32, which will be removed by type legalization. If we widen + // narrow vectors, then we bitcast to v4i32 and extract v2i32. 
+ MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32); + Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); - APInt SplatVal; - if (!isConstantSplat(N->getOperand(1), SplatVal) || !SplatVal.isOneValue()) - return SDValue(); + if (VT.getSizeInBits() > ResVT.getSizeInBits()) { + // Fill the upper elements with zero to match the add width. + assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs"); + unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits(); + SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, DL, ResVT)); + Ops[0] = Sad; + Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops); + } else if (VT.getSizeInBits() < ResVT.getSizeInBits()) { + Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad, + DAG.getIntPtrConstant(0, DL)); + } - SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N)); - unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; - return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec); + // Preserve the reduction flag on the ADD. We may need to revisit for the + // other operand. + SDNodeFlags Flags; + Flags.setVectorReduction(true); + return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags); } static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, @@ -43294,8 +43809,8 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, } // Attempt to turn this pattern into PMADDWD. -// (mul (add (zext (build_vector)), (zext (build_vector))), -// (add (zext (build_vector)), (zext (build_vector))) +// (mul (add (sext (build_vector)), (sext (build_vector))), +// (add (sext (build_vector)), (sext (build_vector))) static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget) { @@ -43415,6 +43930,7 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, } static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { const SDNodeFlags Flags = N->getFlags(); if (Flags.hasVectorReduction()) { @@ -43445,8 +43961,29 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, HADDBuilder); } - if (SDValue V = combineIncDecVector(N, DAG)) - return V; + // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into + // (sub Y, (sext (vXi1 X))). + // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in + // generic DAG combine without a legal type check, but adding this there + // caused regressions. 
+ if (VT.isVector()) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (Op0.getOpcode() == ISD::ZERO_EXTEND && + Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 && + TLI.isTypeLegal(Op0.getOperand(0).getValueType())) { + SDLoc DL(N); + SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0)); + return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt); + } + + if (Op1.getOpcode() == ISD::ZERO_EXTEND && + Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 && + TLI.isTypeLegal(Op1.getOperand(0).getValueType())) { + SDLoc DL(N); + SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0)); + return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt); + } + } return combineAddOrSubToADCOrSBB(N, DAG); } @@ -43457,13 +43994,15 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, SDValue Op1 = N->getOperand(1); EVT VT = N->getValueType(0); + if (!VT.isVector()) + return SDValue(); + // PSUBUS is supported, starting from SSE2, but truncation for v8i32 // is only worth it with SSSE3 (PSHUFB). - if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) && + EVT EltVT = VT.getVectorElementType(); + if (!(Subtarget.hasSSE2() && (EltVT == MVT::i8 || EltVT == MVT::i16)) && !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) && - !(Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) && - !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 || - VT == MVT::v16i32 || VT == MVT::v8i64))) + !(Subtarget.useBWIRegs() && (VT == MVT::v16i32))) return SDValue(); SDValue SubusLHS, SubusRHS; @@ -43493,16 +44032,13 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, } else return SDValue(); - auto USUBSATBuilder = [](SelectionDAG &DAG, const SDLoc &DL, - ArrayRef<SDValue> Ops) { - return DAG.getNode(ISD::USUBSAT, DL, Ops[0].getValueType(), Ops); - }; - // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with // special preprocessing in some cases. - if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64) - return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, - { SubusLHS, SubusRHS }, USUBSATBuilder); + if (EltVT == MVT::i8 || EltVT == MVT::i16) + return DAG.getNode(ISD::USUBSAT, SDLoc(N), VT, SubusLHS, SubusRHS); + + assert((VT == MVT::v8i32 || VT == MVT::v16i32 || VT == MVT::v8i64) && + "Unexpected VT!"); // Special preprocessing case can be only applied // if the value was zero extended from 16 bit, @@ -43531,15 +44067,16 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, SDValue NewSubusLHS = DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType); SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType); - SDValue Psubus = - SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType, - { NewSubusLHS, NewSubusRHS }, USUBSATBuilder); + SDValue Psubus = DAG.getNode(ISD::USUBSAT, SDLoc(N), ShrinkedType, + NewSubusLHS, NewSubusRHS); + // Zero extend the result, it may be used somewhere as 32 bit, // if not zext and following trunc will shrink. 
return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType); } static SDValue combineSub(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -43576,9 +44113,6 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, HSUBBuilder); } - if (SDValue V = combineIncDecVector(N, DAG)) - return V; - // Try to create PSUBUS if SUB's argument is max/min if (SDValue V = combineSubToSubus(N, DAG, Subtarget)) return V; @@ -43712,14 +44246,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, } } - // If we're inserting all zeros into the upper half, change this to - // an insert into an all zeros vector. We will match this to a move - // with implicit upper bit zeroing during isel. - if (Ops.size() == 2 && ISD::isBuildVectorAllZeros(Ops[1].getNode())) - return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, - getZeroVector(VT, Subtarget, DAG, DL), Ops[0], - DAG.getIntPtrConstant(0, DL)); - return SDValue(); } @@ -43786,10 +44312,10 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, // least as large as the original insertion. Just insert the original // subvector into a zero vector. if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 && - SubVec.getConstantOperandAPInt(1) == 0 && + isNullConstant(SubVec.getOperand(1)) && SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) { SDValue Ins = SubVec.getOperand(0); - if (Ins.getConstantOperandAPInt(2) == 0 && + if (isNullConstant(Ins.getOperand(2)) && ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) && Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits()) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, @@ -43825,31 +44351,42 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, // Match concat_vector style patterns. SmallVector<SDValue, 2> SubVectorOps; - if (collectConcatOps(N, SubVectorOps)) + if (collectConcatOps(N, SubVectorOps)) { if (SDValue Fold = combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget)) return Fold; - // If we are inserting into both halves of the vector, the starting vector - // should be undef. If it isn't, make it so. Only do this if the early insert - // has no other uses. - // TODO: Should this be a generic DAG combine? - // TODO: Why doesn't SimplifyDemandedVectorElts catch this? - if ((IdxVal == OpVT.getVectorNumElements() / 2) && - Vec.getOpcode() == ISD::INSERT_SUBVECTOR && - OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2 && - isNullConstant(Vec.getOperand(2)) && !Vec.getOperand(0).isUndef() && - Vec.hasOneUse()) { - Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT), - Vec.getOperand(1), Vec.getOperand(2)); - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec, - N->getOperand(2)); + // If we're inserting all zeros into the upper half, change this to + // a concat with zero. We will match this to a move + // with implicit upper bit zeroing during isel. + // We do this here because we don't want combineConcatVectorOps to + // create INSERT_SUBVECTOR from CONCAT_VECTORS. + if (SubVectorOps.size() == 2 && + ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode())) + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, + getZeroVector(OpVT, Subtarget, DAG, dl), + SubVectorOps[0], DAG.getIntPtrConstant(0, dl)); } // If this is a broadcast insert into an upper undef, use a larger broadcast. 
if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST) return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0)); + // If this is a broadcast load inserted into an upper undef, use a larger + // broadcast load. + if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() && + SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) { + auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec); + SDVTList Tys = DAG.getVTList(OpVT, MVT::Other); + SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() }; + SDValue BcastLd = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, + MemIntr->getMemoryVT(), + MemIntr->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1)); + return BcastLd; + } + return SDValue(); } @@ -43928,12 +44465,15 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, return SDValue(); MVT VT = N->getSimpleValueType(0); - EVT WideVecVT = N->getOperand(0).getValueType(); - SDValue WideVec = peekThroughBitcasts(N->getOperand(0)); + SDValue InVec = N->getOperand(0); + SDValue InVecBC = peekThroughBitcasts(InVec); + EVT InVecVT = InVec.getValueType(); + EVT InVecBCVT = InVecBC.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && - TLI.isTypeLegal(WideVecVT) && - WideVecVT.getSizeInBits() == 256 && WideVec.getOpcode() == ISD::AND) { + TLI.isTypeLegal(InVecVT) && + InVecVT.getSizeInBits() == 256 && InVecBC.getOpcode() == ISD::AND) { auto isConcatenatedNot = [] (SDValue V) { V = peekThroughBitcasts(V); if (!isBitwiseNot(V)) @@ -43941,12 +44481,12 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, SDValue NotOp = V->getOperand(0); return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS; }; - if (isConcatenatedNot(WideVec.getOperand(0)) || - isConcatenatedNot(WideVec.getOperand(1))) { + if (isConcatenatedNot(InVecBC.getOperand(0)) || + isConcatenatedNot(InVecBC.getOperand(1))) { // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1 - SDValue Concat = split256IntArith(WideVec, DAG); + SDValue Concat = split256IntArith(InVecBC, DAG); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, - DAG.getBitcast(WideVecVT, Concat), N->getOperand(1)); + DAG.getBitcast(InVecVT, Concat), N->getOperand(1)); } } @@ -43956,7 +44496,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, if (SDValue V = narrowExtractedVectorSelect(N, DAG)) return V; - SDValue InVec = N->getOperand(0); unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); if (ISD::isBuildVectorAllZeros(InVec.getNode())) @@ -43976,31 +44515,42 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, // Try to move vector bitcast after extract_subv by scaling extraction index: // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index') // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR - if (InVec.getOpcode() == ISD::BITCAST && - InVec.getOperand(0).getValueType().isVector()) { - SDValue SrcOp = InVec.getOperand(0); - EVT SrcVT = SrcOp.getValueType(); - unsigned SrcNumElts = SrcVT.getVectorNumElements(); - unsigned DestNumElts = InVec.getValueType().getVectorNumElements(); + if (InVec != InVecBC && InVecBCVT.isVector()) { + unsigned SrcNumElts = InVecBCVT.getVectorNumElements(); + unsigned DestNumElts = InVecVT.getVectorNumElements(); if ((DestNumElts % SrcNumElts) == 0) { unsigned DestSrcRatio = DestNumElts / SrcNumElts; if ((VT.getVectorNumElements() % 
DestSrcRatio) == 0) { unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio; EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), - SrcVT.getScalarType(), NewExtNumElts); + InVecBCVT.getScalarType(), NewExtNumElts); if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 && TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio; SDLoc DL(N); SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL); SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, - SrcOp, NewIndex); + InVecBC, NewIndex); return DAG.getBitcast(VT, NewExtract); } } } } + // If we are extracting from an insert into a zero vector, replace with a + // smaller insert into zero if we don't access less than the original + // subvector. Don't do this for i1 vectors. + if (VT.getVectorElementType() != MVT::i1 && + InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 && + InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) && + ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) && + InVec.getOperand(1).getValueSizeInBits() <= VT.getSizeInBits()) { + SDLoc DL(N); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, + getZeroVector(VT, Subtarget, DAG, DL), + InVec.getOperand(1), InVec.getOperand(2)); + } + // If we're extracting from a broadcast then we're better off just // broadcasting to the smaller type directly, assuming this is the only use. // As its a broadcast we don't care about the extraction index. @@ -44008,11 +44558,25 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits()) return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0)); + if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) { + auto *MemIntr = cast<MemIntrinsicSDNode>(InVec); + if (MemIntr->getMemoryVT().getSizeInBits() <= VT.getSizeInBits()) { + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() }; + SDValue BcastLd = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops, + MemIntr->getMemoryVT(), + MemIntr->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1)); + return BcastLd; + } + } + // If we're extracting the lowest subvector and we're the only user, // we may be able to perform this with a smaller vector width. if (IdxVal == 0 && InVec.hasOneUse()) { unsigned InOpcode = InVec.getOpcode(); - if (VT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) { + if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) { // v2f64 CVTDQ2PD(v4i32). if (InOpcode == ISD::SINT_TO_FP && InVec.getOperand(0).getValueType() == MVT::v4i32) { @@ -44093,7 +44657,8 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { // Simplify PMULDQ and PMULUDQ operations. static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); @@ -44103,23 +44668,43 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS); // Multiply by zero. + // Don't return RHS as it may contain UNDEFs. if (ISD::isBuildVectorAllZeros(RHS.getNode())) - return RHS; - - // Aggressively peek through ops to get at the demanded low bits. 
- APInt DemandedMask = APInt::getLowBitsSet(64, 32); - SDValue DemandedLHS = DAG.GetDemandedBits(LHS, DemandedMask); - SDValue DemandedRHS = DAG.GetDemandedBits(RHS, DemandedMask); - if (DemandedLHS || DemandedRHS) - return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), - DemandedLHS ? DemandedLHS : LHS, - DemandedRHS ? DemandedRHS : RHS); + return DAG.getConstant(0, SDLoc(N), N->getValueType(0)); // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI)) return SDValue(N, 0); + // If the input is an extend_invec and the SimplifyDemandedBits call didn't + // convert it to any_extend_invec, due to the LegalOperations check, do the + // conversion directly to a vector shuffle manually. This exposes combine + // opportunities missed by combineExtInVec not calling + // combineX86ShufflesRecursively on SSE4.1 targets. + // FIXME: This is basically a hack around several other issues related to + // ANY_EXTEND_VECTOR_INREG. + if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() && + (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG || + LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) && + LHS.getOperand(0).getValueType() == MVT::v4i32) { + SDLoc dl(N); + LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0), + LHS.getOperand(0), { 0, -1, 1, -1 }); + LHS = DAG.getBitcast(MVT::v2i64, LHS); + return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS); + } + if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() && + (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG || + RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) && + RHS.getOperand(0).getValueType() == MVT::v4i32) { + SDLoc dl(N); + RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0), + RHS.getOperand(0), { 0, -1, 1, -1 }); + RHS = DAG.getBitcast(MVT::v2i64, RHS); + return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS); + } + return SDValue(); } @@ -44134,7 +44719,7 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { auto *Ld = cast<LoadSDNode>(In); - if (!Ld->isVolatile()) { + if (Ld->isSimple()) { MVT SVT = In.getSimpleValueType().getVectorElementType(); ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SEXTLOAD : ISD::ZEXTLOAD; EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT, @@ -44150,17 +44735,6 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, } } - // Disabling for widening legalization for now. We can enable if we find a - // case that needs it. Otherwise it can be deleted when we switch to - // widening legalization. - if (ExperimentalVectorWideningLegalization) - return SDValue(); - - // Combine (ext_invec (ext_invec X)) -> (ext_invec X) - if (In.getOpcode() == N->getOpcode() && - TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType())) - return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0)); - // Attempt to combine as a shuffle. 
// TODO: SSE41 support if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) { @@ -44173,6 +44747,20 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + + APInt KnownUndef, KnownZero; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); + if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef, + KnownZero, DCI)) + return SDValue(N, 0); + + return SDValue(); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -44196,8 +44784,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget); case X86ISD::CMP: return combineCMP(N, DAG); - case ISD::ADD: return combineAdd(N, DAG, Subtarget); - case ISD::SUB: return combineSub(N, DAG, Subtarget); + case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget); + case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget); case X86ISD::ADD: case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI); case X86ISD::SBB: return combineSBB(N, DAG); @@ -44214,12 +44802,13 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget); case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget); - case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget); + case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, DCI, Subtarget); case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget); case ISD::FADD: case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); case ISD::FNEG: return combineFneg(N, DAG, Subtarget); case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); + case X86ISD::VTRUNC: return combineVTRUNC(N, DAG); case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget); case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); @@ -44299,20 +44888,22 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FNMADD_RND: case X86ISD::FNMSUB: case X86ISD::FNMSUB_RND: - case ISD::FMA: return combineFMA(N, DAG, Subtarget); + case ISD::FMA: return combineFMA(N, DAG, DCI, Subtarget); case X86ISD::FMADDSUB_RND: case X86ISD::FMSUBADD_RND: case X86ISD::FMADDSUB: - case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget); - case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI); + case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI); + case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget); case X86ISD::MGATHER: - case X86ISD::MSCATTER: + case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI); case ISD::MGATHER: - case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget); + case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI); case X86ISD::PCMPEQ: case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget); case X86ISD::PMULDQ: - case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI); + case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget); + case X86ISD::KSHIFTL: + case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI); } return SDValue(); @@ -44660,10 +45251,11 @@ 
X86TargetLowering::getConstraintType(StringRef Constraint) const { case 'I': case 'J': case 'K': - case 'L': - case 'M': case 'N': case 'G': + case 'L': + case 'M': + return C_Immediate; case 'C': case 'e': case 'Z': @@ -45175,8 +45767,9 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::FR64XRegClass); return std::make_pair(0U, &X86::FR64RegClass); - // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. - // Vector types. + // TODO: Handle i128 in FR128RegClass after it is tested well. + // Vector types and fp128. + case MVT::f128: case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: @@ -45469,7 +46062,7 @@ void X86TargetLowering::insertCopiesSplitCSR( else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); - unsigned NewVR = MRI->createVirtualRegister(RC); + Register NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be @@ -45514,3 +46107,16 @@ X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const { return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk"; return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk"; } + +unsigned +X86TargetLowering::getStackProbeSize(MachineFunction &MF) const { + // The default stack probe size is 4096 if the function has no stackprobesize + // attribute. + unsigned StackProbeSize = 4096; + const Function &Fn = MF.getFunction(); + if (Fn.hasFnAttribute("stack-probe-size")) + Fn.getFnAttribute("stack-probe-size") + .getValueAsString() + .getAsInteger(0, StackProbeSize); + return StackProbeSize; +} diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index e0be03bc3f9d..6f7e90008de4 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -17,7 +17,6 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" -#include "llvm/Target/TargetOptions.h" namespace llvm { class X86Subtarget; @@ -144,6 +143,10 @@ namespace llvm { /// relative displacements. WrapperRIP, + /// Copies a 64-bit value from an MMX vector to the low word + /// of an XMM vector, with the high word zero filled. + MOVQ2DQ, + /// Copies a 64-bit value from the low word of an XMM vector /// to an MMX vector. MOVDQ2Q, @@ -422,7 +425,8 @@ namespace llvm { // Tests Types Of a FP Values for scalar types. VFPCLASSS, - // Broadcast scalar to vector. + // Broadcast (splat) scalar or element 0 of a vector. If the operand is + // a vector, this node may change the vector length as part of the splat. VBROADCAST, // Broadcast mask to vector. VBROADCASTM, @@ -611,6 +615,9 @@ namespace llvm { // extract_vector_elt, store. VEXTRACT_STORE, + // scalar broadcast from memory + VBROADCAST_LOAD, + // Store FP control world into i16 memory. FNSTCW16m, @@ -680,6 +687,9 @@ namespace llvm { bool isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool GuaranteeTCO); + /// If Op is a constant whose elements are all the same constant or + /// undefined, return true and return the constant value in \p SplatVal. + bool isConstantSplat(SDValue Op, APInt &SplatVal); } // end namespace X86 //===--------------------------------------------------------------------===// @@ -792,6 +802,17 @@ namespace llvm { /// and some i16 instructions are slow. 
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override; + /// Return 1 if we can compute the negated form of the specified expression + /// for the same cost as the expression itself, or 2 if we can compute the + /// negated form more cheaply than the expression itself. Else return 0. + char isNegatibleForFree(SDValue Op, SelectionDAG &DAG, bool LegalOperations, + bool ForCodeSize, unsigned Depth) const override; + + /// If isNegatibleForFree returns true, return the newly negated expression. + SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, bool ForCodeSize, + unsigned Depth) const override; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; @@ -840,6 +861,13 @@ namespace llvm { bool hasAndNot(SDValue Y) const override; + bool hasBitTest(SDValue X, SDValue Y) const override; + + bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, + unsigned OldShiftOpcode, unsigned NewShiftOpcode, + SelectionDAG &DAG) const override; + bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override; @@ -863,11 +891,7 @@ namespace llvm { return VTIsOk(XVT) && VTIsOk(KeptBitsVT); } - bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override { - if (DAG.getMachineFunction().getFunction().hasMinSize()) - return false; - return true; - } + bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override; bool shouldSplatInsEltVarIndex(EVT VT) const override; @@ -913,6 +937,10 @@ namespace llvm { TargetLoweringOpt &TLO, unsigned Depth) const override; + SDValue SimplifyMultipleUseDemandedBitsForTargetNode( + SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, + SelectionDAG &DAG, unsigned Depth) const override; + const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override; SDValue unwrapAddress(SDValue N) const override; @@ -1090,11 +1118,12 @@ namespace llvm { bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; - bool reduceSelectOfFPConstantLoads(bool IsFPSetCC) const override; + bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override; bool convertSelectOfConstantsToMath(EVT VT) const override; - bool decomposeMulByConstant(EVT VT, SDValue C) const override; + bool decomposeMulByConstant(LLVMContext &Context, EVT VT, + SDValue C) const override; bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT, bool IsSigned) const override; @@ -1136,8 +1165,8 @@ namespace llvm { return nullptr; // nothing to do, move along. } - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; /// If a physical register, this returns the register that receives the /// exception address on entry to an EH pad. 
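[Editor's aside: the isNegatibleForFree/getNegatedExpression overrides declared just above encode negation cost as a char, and the combines earlier in this diff (combineFMA, combineFMADDSUB) only transform when the answer is 2, i.e. the negated form is strictly cheaper. The toy C++ below is only a sketch of that 0/1/2 contract; the Expr enum and its cost table are invented for illustration and are not LLVM's implementation.]

```cpp
#include <cassert>

// Toy stand-in for the kinds of SDValue an X86 combine might see;
// invented for this sketch only.
enum class Expr { FNeg, ConstantFP, FMul, Opaque };

// Mirrors the documented contract: 0 = not negatible for free,
// 1 = negatible at the same cost, 2 = negated form is strictly cheaper.
int isNegatibleForFree(Expr E) {
  switch (E) {
  case Expr::FNeg:       return 2; // peeling off the FNEG removes a node
  case Expr::ConstantFP: return 2; // a negated constant is just another pool entry
  case Expr::FMul:       return 1; // negation can be pushed into one operand
  case Expr::Opaque:     return 0;
  }
  return 0;
}

int main() {
  // combineFMADDSUB-style guard: only transform when strictly cheaper.
  assert(isNegatibleForFree(Expr::FNeg) == 2);   // would fold
  assert(isNegatibleForFree(Expr::FMul) != 2);   // would bail out
  assert(isNegatibleForFree(Expr::Opaque) == 0); // not negatible at all
}
```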
@@ -1189,12 +1218,18 @@ namespace llvm { CallingConv::ID CC, EVT VT) const override; + unsigned getVectorTypeBreakdownForCallingConv( + LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, + unsigned &NumIntermediates, MVT &RegisterVT) const override; + bool isIntDivCheap(EVT VT, AttributeList Attr) const override; bool supportSwiftError() const override; StringRef getStackProbeSymbolName(MachineFunction &MF) const override; + unsigned getStackProbeSize(MachineFunction &MF) const; + bool hasVectorBlend() const override { return true; } unsigned getMaxSupportedInterleaveFactor() const override { return 4; } @@ -1326,6 +1361,12 @@ namespace llvm { SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG, + RTLIB::Libcall Call) const; SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -1372,6 +1413,9 @@ namespace llvm { LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; + bool lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const override; + bool lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const override; + bool needsCmpXchgNb(Type *MemType) const; void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, @@ -1462,6 +1506,9 @@ namespace llvm { /// Reassociate floating point divisions into multiply by reciprocal. unsigned combineRepeatedFPDivisors() const override; + + SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, + SmallVectorImpl<SDNode *> &Created) const override; }; namespace X86 { @@ -1625,24 +1672,24 @@ namespace llvm { /// mask. This is the reverse process to canWidenShuffleElements, but can /// always succeed. template <typename T> - void scaleShuffleMask(int Scale, ArrayRef<T> Mask, + void scaleShuffleMask(size_t Scale, ArrayRef<T> Mask, SmallVectorImpl<T> &ScaledMask) { assert(0 < Scale && "Unexpected scaling factor"); size_t NumElts = Mask.size(); ScaledMask.assign(NumElts * Scale, -1); - for (int i = 0; i != (int)NumElts; ++i) { + for (size_t i = 0; i != NumElts; ++i) { int M = Mask[i]; // Repeat sentinel values in every mask element. if (M < 0) { - for (int s = 0; s != Scale; ++s) + for (size_t s = 0; s != Scale; ++s) ScaledMask[(Scale * i) + s] = M; continue; } // Scale mask element and increment across each mask element. 
- for (int s = 0; s != Scale; ++s) + for (size_t s = 0; s != Scale; ++s) ScaledMask[(Scale * i) + s] = (Scale * M) + s; } } diff --git a/lib/Target/X86/X86IndirectBranchTracking.cpp b/lib/Target/X86/X86IndirectBranchTracking.cpp index 04e8b2231fec..cc0f59ab329d 100644 --- a/lib/Target/X86/X86IndirectBranchTracking.cpp +++ b/lib/Target/X86/X86IndirectBranchTracking.cpp @@ -84,7 +84,7 @@ bool X86IndirectBranchTrackingPass::addENDBR( return false; } -bool IsCallReturnTwice(llvm::MachineOperand &MOp) { +static bool IsCallReturnTwice(llvm::MachineOperand &MOp) { if (!MOp.isGlobal()) return false; auto *CalleeFn = dyn_cast<Function>(MOp.getGlobal()); diff --git a/lib/Target/X86/X86InsertPrefetch.cpp b/lib/Target/X86/X86InsertPrefetch.cpp index 02ae73706a34..2b1e3f23efd7 100644 --- a/lib/Target/X86/X86InsertPrefetch.cpp +++ b/lib/Target/X86/X86InsertPrefetch.cpp @@ -79,8 +79,8 @@ ErrorOr<PrefetchHints> getPrefetchHints(const FunctionSamples *TopSamples, // The prefetch instruction can't take memory operands involving vector // registers. bool IsMemOpCompatibleWithPrefetch(const MachineInstr &MI, int Op) { - unsigned BaseReg = MI.getOperand(Op + X86::AddrBaseReg).getReg(); - unsigned IndexReg = MI.getOperand(Op + X86::AddrIndexReg).getReg(); + Register BaseReg = MI.getOperand(Op + X86::AddrBaseReg).getReg(); + Register IndexReg = MI.getOperand(Op + X86::AddrIndexReg).getReg(); return (BaseReg == 0 || X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) || X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg)) && @@ -108,7 +108,7 @@ bool X86InsertPrefetch::findPrefetchInfo(const FunctionSamples *TopSamples, Prefetches &Prefetches) const { assert(Prefetches.empty() && "Expected caller passed empty PrefetchInfo vector."); - static const std::pair<const StringRef, unsigned> HintTypes[] = { + static constexpr std::pair<StringLiteral, unsigned> HintTypes[] = { {"_nta_", X86::PREFETCHNTA}, {"_t0_", X86::PREFETCHT0}, {"_t1_", X86::PREFETCHT1}, @@ -173,7 +173,7 @@ bool X86InsertPrefetch::doInitialization(Module &M) { void X86InsertPrefetch::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequired<MachineModuleInfo>(); + AU.addRequired<MachineModuleInfoWrapperPass>(); } bool X86InsertPrefetch::runOnMachineFunction(MachineFunction &MF) { diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 54eddeacaa17..9b5de59430a5 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -74,6 +74,7 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" # VTName); PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT); + PatFrag BroadcastLdFrag = !cast<PatFrag>("X86VBroadcastld" # EltSizeName); ComplexPattern ScalarIntMemCPat = !if (!eq (EltTypeName, "f32"), !cast<ComplexPattern>("sse_load_f32"), @@ -412,6 +413,14 @@ def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "", [(set VR512:$dst, (v16i32 immAllOnesV))]>; } +let Predicates = [HasAVX512] in { +def : Pat<(v64i8 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v32i16 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v16f32 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>; +} + // Alias instructions that allow VPTERNLOG to be used with a mask to create // a mix of all ones and all zeros elements. This is done this way to force // the same register to be used as input for all three sources. 
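[Editor's aside: the scaleShuffleMask template a few hunks up (now parameterized on size_t) is small enough to model outside LLVM. Below is a rough standalone C++ re-implementation of the same widening logic, with std::vector standing in for SmallVectorImpl; it is illustrative only, not the LLVM code.]

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Widen a shuffle mask by Scale: element M expands to the Scale entries
// (Scale*M)+0 .. (Scale*M)+Scale-1, while sentinel values (M < 0) are
// repeated unchanged, matching the template's behavior.
std::vector<int> scaleShuffleMask(std::size_t Scale,
                                  const std::vector<int> &Mask) {
  assert(Scale > 0 && "Unexpected scaling factor");
  std::vector<int> ScaledMask(Mask.size() * Scale, -1);
  for (std::size_t i = 0; i != Mask.size(); ++i) {
    int M = Mask[i];
    for (std::size_t s = 0; s != Scale; ++s)
      ScaledMask[Scale * i + s] =
          M < 0 ? M : static_cast<int>(Scale * M + s);
  }
  return ScaledMask;
}

int main() {
  // Scaling the mask <1, -1> by 2 yields <2, 3, -1, -1>.
  assert((scaleShuffleMask(2, {1, -1}) == std::vector<int>{2, 3, -1, -1}));
}
```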
@@ -436,6 +445,19 @@ def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "", [(set VR256X:$dst, (v8i32 immAllZerosV))]>; } +let Predicates = [HasAVX512] in { +def : Pat<(v8i16 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v16i8 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v2i64 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v4f32 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v2f64 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v32i8 immAllZerosV), (AVX512_256_SET0)>; +def : Pat<(v16i16 immAllZerosV), (AVX512_256_SET0)>; +def : Pat<(v4i64 immAllZerosV), (AVX512_256_SET0)>; +def : Pat<(v8f32 immAllZerosV), (AVX512_256_SET0)>; +def : Pat<(v4f64 immAllZerosV), (AVX512_256_SET0)>; +} + // Alias instructions that map fld0 to xorps for sse or vxorps for avx. // This is expanded by ExpandPostRAPseudos. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, @@ -443,7 +465,9 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "", [(set FR32X:$dst, fp32imm0)]>; def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "", - [(set FR64X:$dst, fpimm0)]>; + [(set FR64X:$dst, fp64imm0)]>; + def AVX512_FsFLD0F128 : I<0, Pseudo, (outs VR128X:$dst), (ins), "", + [(set VR128X:$dst, fp128imm0)]>; } //===----------------------------------------------------------------------===// @@ -730,14 +754,14 @@ let isCommutable = 1 in def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>, + [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, timm:$src3))]>, EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>; def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), (ins VR128X:$src1, f32mem:$src2, u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), - imm:$src3))]>, + timm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>, Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>; } @@ -1100,75 +1124,104 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr, X86VectorVTInfo MaskInfo, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, - SDPatternOperator UnmaskedOp = X86VBroadcast> { - let ExeDomain = DestInfo.ExeDomain, hasSideEffects = 0 in { - defm r : AVX512_maskable_split<opc, MRMSrcReg, MaskInfo, - (outs MaskInfo.RC:$dst), - (ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src", - (MaskInfo.VT - (bitconvert - (DestInfo.VT - (UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))), - (MaskInfo.VT - (bitconvert - (DestInfo.VT - (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))))>, - T8PD, EVEX, Sched<[SchedRR]>; - let mayLoad = 1 in - defm m : AVX512_maskable_split<opc, MRMSrcMem, MaskInfo, - (outs MaskInfo.RC:$dst), - (ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src", - (MaskInfo.VT - (bitconvert - (DestInfo.VT (UnmaskedOp - (SrcInfo.ScalarLdFrag addr:$src))))), - (MaskInfo.VT - (bitconvert - (DestInfo.VT (X86VBroadcast - (SrcInfo.ScalarLdFrag addr:$src)))))>, - T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>, - Sched<[SchedRM]>; - } - - def : Pat<(MaskInfo.VT - (bitconvert - (DestInfo.VT (UnmaskedOp - (SrcInfo.VT (scalar_to_vector - (SrcInfo.ScalarLdFrag addr:$src))))))), - (!cast<Instruction>(Name#MaskInfo.ZSuffix#m) addr:$src)>; - def : Pat<(MaskInfo.VT (vselect 
MaskInfo.KRCWM:$mask, + bit IsConvertibleToThreeAddress, + SDPatternOperator UnmaskedOp = X86VBroadcast, + SDPatternOperator UnmaskedBcastOp = SrcInfo.BroadcastLdFrag> { + let hasSideEffects = 0 in + def r : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), (ins SrcInfo.RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set MaskInfo.RC:$dst, + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))))], + DestInfo.ExeDomain>, T8PD, EVEX, Sched<[SchedRR]>; + def rkz : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), + (ins MaskInfo.KRCWM:$mask, SrcInfo.RC:$src), + !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", + "${dst} {${mask}} {z}, $src}"), + [(set MaskInfo.RC:$dst, + (vselect MaskInfo.KRCWM:$mask, + (MaskInfo.VT (bitconvert (DestInfo.VT - (X86VBroadcast - (SrcInfo.VT (scalar_to_vector - (SrcInfo.ScalarLdFrag addr:$src)))))), - MaskInfo.RC:$src0)), - (!cast<Instruction>(Name#DestInfo.ZSuffix#mk) - MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, addr:$src)>; - def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask, + (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))), + MaskInfo.ImmAllZerosV))], + DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, Sched<[SchedRR]>; + let Constraints = "$src0 = $dst" in + def rk : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), + (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, + SrcInfo.RC:$src), + !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|", + "${dst} {${mask}}, $src}"), + [(set MaskInfo.RC:$dst, + (vselect MaskInfo.KRCWM:$mask, + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))), + MaskInfo.RC:$src0))], + DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, Sched<[SchedRR]>; + + let hasSideEffects = 0, mayLoad = 1 in + def m : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst), + (ins SrcInfo.ScalarMemOp:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set MaskInfo.RC:$dst, + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (UnmaskedBcastOp addr:$src)))))], + DestInfo.ExeDomain>, T8PD, EVEX, + EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>; + + def mkz : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst), + (ins MaskInfo.KRCWM:$mask, SrcInfo.ScalarMemOp:$src), + !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", + "${dst} {${mask}} {z}, $src}"), + [(set MaskInfo.RC:$dst, + (vselect MaskInfo.KRCWM:$mask, + (MaskInfo.VT (bitconvert (DestInfo.VT - (X86VBroadcast - (SrcInfo.VT (scalar_to_vector - (SrcInfo.ScalarLdFrag addr:$src)))))), - MaskInfo.ImmAllZerosV)), - (!cast<Instruction>(Name#MaskInfo.ZSuffix#mkz) - MaskInfo.KRCWM:$mask, addr:$src)>; + (SrcInfo.BroadcastLdFrag addr:$src)))), + MaskInfo.ImmAllZerosV))], + DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, + EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>; + + let Constraints = "$src0 = $dst", + isConvertibleToThreeAddress = IsConvertibleToThreeAddress in + def mk : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst), + (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, + SrcInfo.ScalarMemOp:$src), + !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|", + "${dst} {${mask}}, $src}"), + [(set MaskInfo.RC:$dst, + (vselect MaskInfo.KRCWM:$mask, + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (SrcInfo.BroadcastLdFrag addr:$src)))), + MaskInfo.RC:$src0))], + DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, + EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>; } // Helper class to force mask and broadcast result to same type. 
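For readers less familiar with EVEX masking: the r/rk/rkz and m/mk/mkz forms spelled out above differ only in what happens to lanes whose mask bit is clear. The plain forms write every lane, the rk/mk merge forms keep the corresponding lane of the tied $src0 operand, and the rkz/mkz zeroing forms write zero. A rough scalar model of the vselect semantics those patterns encode (the lane count here is arbitrary, chosen only for illustration):

#include <array>
#include <cstddef>
#include <cstdint>

// Scalar model of a masked broadcast: every active destination lane gets the
// same scalar; masked-off lanes take either the passthru value (merge
// masking, the "rk" form) or zero (zero masking, the "rkz" form).
template <std::size_t N>
std::array<int, N> broadcast(int Src, std::uint16_t Mask,
                             const std::array<int, N> &Passthru,
                             bool ZeroMask) {
  std::array<int, N> Dst{};
  for (std::size_t I = 0; I != N; ++I)
    Dst[I] = ((Mask >> I) & 1) ? Src : (ZeroMask ? 0 : Passthru[I]);
  return Dst;
}

int main() {
  std::array<int, 8> Old{1, 2, 3, 4, 5, 6, 7, 8};
  // Lanes 0-3 take 42 in both cases; lanes 4-7 keep Old under merge masking
  // and become 0 under zero masking.
  auto Merged = broadcast(42, 0x0F, Old, /*ZeroMask=*/false);
  auto Zeroed = broadcast(42, 0x0F, Old, /*ZeroMask=*/true);
  return Merged[7] - Zeroed[7] - 8; // 8 - 0 - 8 == 0
}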
multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, string Name, SchedWrite SchedRR, SchedWrite SchedRM, X86VectorVTInfo DestInfo, - X86VectorVTInfo SrcInfo> : + X86VectorVTInfo SrcInfo, + bit IsConvertibleToThreeAddress> : avx512_broadcast_rm_split<opc, OpcodeStr, Name, SchedRR, SchedRM, - DestInfo, DestInfo, SrcInfo>; + DestInfo, DestInfo, SrcInfo, + IsConvertibleToThreeAddress>; multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _> { let Predicates = [HasAVX512] in { defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256, - WriteFShuffle256Ld, _.info512, _.info128>, + WriteFShuffle256Ld, _.info512, _.info128, 1>, avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512, _.info128>, EVEX_V512; @@ -1176,7 +1229,7 @@ multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr, let Predicates = [HasVLX] in { defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256, - WriteFShuffle256Ld, _.info256, _.info128>, + WriteFShuffle256Ld, _.info256, _.info128, 1>, avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256, _.info128>, EVEX_V256; @@ -1187,7 +1240,7 @@ multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _> { let Predicates = [HasAVX512] in { defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256, - WriteFShuffle256Ld, _.info512, _.info128>, + WriteFShuffle256Ld, _.info512, _.info128, 1>, avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512, _.info128>, EVEX_V512; @@ -1195,12 +1248,12 @@ multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr, let Predicates = [HasVLX] in { defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256, - WriteFShuffle256Ld, _.info256, _.info128>, + WriteFShuffle256Ld, _.info256, _.info128, 1>, avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256, _.info128>, EVEX_V256; defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256, - WriteFShuffle256Ld, _.info128, _.info128>, + WriteFShuffle256Ld, _.info128, _.info128, 1>, avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info128, _.info128>, EVEX_V128; @@ -1284,46 +1337,35 @@ defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, X86VBroadcast, GR64, HasAVX512>, VEX_W; -// Provide aliases for broadcast from the same register class that -// automatically does the extract. -multiclass avx512_int_broadcast_rm_lowering<string Name, - X86VectorVTInfo DestInfo, - X86VectorVTInfo SrcInfo, - X86VectorVTInfo ExtInfo> { - def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))), - (!cast<Instruction>(Name#DestInfo.ZSuffix#"r") - (ExtInfo.VT (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm)))>; -} - multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr, - AVX512VLVectorVTInfo _, Predicate prd> { + AVX512VLVectorVTInfo _, Predicate prd, + bit IsConvertibleToThreeAddress> { let Predicates = [prd] in { defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256, - WriteShuffle256Ld, _.info512, _.info128>, - avx512_int_broadcast_rm_lowering<NAME, _.info512, _.info256, _.info128>, + WriteShuffle256Ld, _.info512, _.info128, + IsConvertibleToThreeAddress>, EVEX_V512; - // Defined separately to avoid redefinition. 
- defm Z_Alt : avx512_int_broadcast_rm_lowering<NAME, _.info512, _.info512, _.info128>; } let Predicates = [prd, HasVLX] in { defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256, - WriteShuffle256Ld, _.info256, _.info128>, - avx512_int_broadcast_rm_lowering<NAME, _.info256, _.info256, _.info128>, + WriteShuffle256Ld, _.info256, _.info128, + IsConvertibleToThreeAddress>, EVEX_V256; defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle, - WriteShuffleXLd, _.info128, _.info128>, + WriteShuffleXLd, _.info128, _.info128, + IsConvertibleToThreeAddress>, EVEX_V128; } } defm VPBROADCASTB : avx512_int_broadcast_rm_vl<0x78, "vpbroadcastb", - avx512vl_i8_info, HasBWI>; + avx512vl_i8_info, HasBWI, 0>; defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw", - avx512vl_i16_info, HasBWI>; + avx512vl_i16_info, HasBWI, 0>; defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd", - avx512vl_i32_info, HasAVX512>; + avx512vl_i32_info, HasAVX512, 1>; defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq", - avx512vl_i64_info, HasAVX512>, VEX_W1X; + avx512vl_i64_info, HasAVX512, 1>, VEX_W1X; multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, X86VectorVTInfo _Dst, X86VectorVTInfo _Src> { @@ -1354,6 +1396,10 @@ let Predicates = [HasAVX512] in { // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQZm addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. + def : Pat<(v16i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDZm addr:$src)>; } let Predicates = [HasVLX] in { @@ -1362,6 +1408,12 @@ let Predicates = [HasVLX] in { (VPBROADCASTQZ128m addr:$src)>; def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQZ256m addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. + def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDZ128m addr:$src)>; + def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDZ256m addr:$src)>; } let Predicates = [HasVLX, HasBWI] in { // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. @@ -1382,6 +1434,12 @@ let Predicates = [HasVLX, HasBWI] in { def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWZ256m addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. + def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWZ128m addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWZ256m addr:$src)>; } let Predicates = [HasBWI] in { // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. @@ -1394,6 +1452,10 @@ let Predicates = [HasBWI] in { def : Pat<(v32i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWZm addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. 
+ def : Pat<(v32i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWZm addr:$src)>; } //===----------------------------------------------------------------------===// @@ -1629,12 +1691,12 @@ multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr, let Predicates = [HasDQI] in defm Z : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256, WriteShuffle256Ld, _Dst.info512, - _Src.info512, _Src.info128, null_frag>, + _Src.info512, _Src.info128, 0, null_frag, null_frag>, EVEX_V512; let Predicates = [HasDQI, HasVLX] in defm Z256 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256, WriteShuffle256Ld, _Dst.info256, - _Src.info256, _Src.info128, null_frag>, + _Src.info256, _Src.info128, 0, null_frag, null_frag>, EVEX_V256; } @@ -1645,7 +1707,7 @@ multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr, let Predicates = [HasDQI, HasVLX] in defm Z128 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle, WriteShuffleXLd, _Dst.info128, - _Src.info128, _Src.info128, null_frag>, + _Src.info128, _Src.info128, 0, null_frag, null_frag>, EVEX_V128; } @@ -1654,23 +1716,6 @@ defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2", defm VBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2", avx512vl_f32_info, avx512vl_f64_info>; -let Predicates = [HasVLX] in { -def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256X:$src))), - (VBROADCASTSSZ256r (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))>; -def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256X:$src))), - (VBROADCASTSDZ256r (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))>; -} - -def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))), - (VBROADCASTSSZr (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)))>; -def : Pat<(v16f32 (X86VBroadcast (v8f32 VR256X:$src))), - (VBROADCASTSSZr (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))>; - -def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))), - (VBROADCASTSDZr (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))>; -def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))), - (VBROADCASTSDZr (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))>; - //===----------------------------------------------------------------------===// // AVX-512 BROADCAST MASK TO VECTOR REGISTER //--- @@ -1730,7 +1775,7 @@ multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr, OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (_.VT (X86VPermt2 _.RC:$src2, - IdxVT.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>, + IdxVT.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>, AVX5128IBase, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -1807,7 +1852,7 @@ multiclass avx512_perm_i_lowering<string InstrStr, X86VectorVTInfo _, def : Pat<(_.VT (vselect _.KRCWM:$mask, (X86VPermt2 _.RC:$src2, (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))), - (X86VBroadcast (_.ScalarLdFrag addr:$src3))), + (_.BroadcastLdFrag addr:$src3)), (_.VT (bitconvert (CastVT.VT _.RC:$src1))))), (!cast<Instruction>(InstrStr#"rmbk") _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3)>; @@ -1846,7 +1891,7 @@ multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (_.VT (X86VPermt2 _.RC:$src1, - IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>, + IdxVT.RC:$src2,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>, AVX5128IBase, EVEX_4V, EVEX_B, 
Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -1947,7 +1992,7 @@ multiclass WriteFVarBlendask<bits<8> opc, string OpcodeStr, } multiclass WriteFVarBlendask_rmb<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let mayLoad = 1, hasSideEffects = 0 in { + let ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in { def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2), !strconcat(OpcodeStr, @@ -2031,9 +2076,9 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), "vcmp"#_.Suffix, "$cc, $src2, $src1", "$src1, $src2, $cc", - (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc), + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), - imm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>; + timm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>; let mayLoad = 1 in defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, (outs _.KRC:$dst), @@ -2041,9 +2086,9 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, "vcmp"#_.Suffix, "$cc, $src2, $src1", "$src1, $src2, $cc", (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2, - imm:$cc), + timm:$cc), (OpNode_su (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2, - imm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, + timm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, @@ -2052,9 +2097,9 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, "vcmp"#_.Suffix, "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc", (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), - imm:$cc), + timm:$cc), (OpNodeSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), - imm:$cc)>, + timm:$cc)>, EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>; let isCodeGenOnly = 1 in { @@ -2065,7 +2110,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [(set _.KRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2, - imm:$cc))]>, + timm:$cc))]>, EVEX_4V, VEX_LIG, Sched<[sched]>; def rm : AVX512Ii8<0xC2, MRMSrcMem, (outs _.KRC:$dst), @@ -2074,7 +2119,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [(set _.KRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2), - imm:$cc))]>, + timm:$cc))]>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -2100,94 +2145,82 @@ let Predicates = [HasAVX512] in { SchedWriteFCmp.Scl>, AVX512XDIi8Base, VEX_W; } -multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode, - PatFrag OpNode_su, X86FoldableSchedWrite sched, +multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite sched, X86VectorVTInfo _, bit IsCommutable> { - let isCommutable = IsCommutable in + let isCommutable = IsCommutable, hasSideEffects = 0 in def rr : AVX512BI<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))]>, - EVEX_4V, Sched<[sched]>; + []>, EVEX_4V, Sched<[sched]>; + let mayLoad = 1, hasSideEffects = 0 in def rm : AVX512BI<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, 
$src1, $src2}"), - [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), - (_.VT (_.LdFrag addr:$src2))))]>, - EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; - let isCommutable = IsCommutable in + []>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; + let isCommutable = IsCommutable, hasSideEffects = 0 in def rrk : AVX512BI<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2}"), - [(set _.KRC:$dst, (and _.KRCWM:$mask, - (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2))))]>, - EVEX_4V, EVEX_K, Sched<[sched]>; + []>, EVEX_4V, EVEX_K, Sched<[sched]>; + let mayLoad = 1, hasSideEffects = 0 in def rmk : AVX512BI<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2}"), - [(set _.KRC:$dst, (and _.KRCWM:$mask, - (OpNode_su (_.VT _.RC:$src1), - (_.VT (_.LdFrag addr:$src2)))))]>, - EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; + []>, EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } -multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, PatFrag OpNode, - PatFrag OpNode_su, +multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, bit IsCommutable> : - avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched, _, IsCommutable> { + avx512_icmp_packed<opc, OpcodeStr, sched, _, IsCommutable> { + let mayLoad = 1, hasSideEffects = 0 in { def rmb : AVX512BI<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst", "|$dst, $src1, ${src2}", _.BroadcastStr, "}"), - [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), - (X86VBroadcast (_.ScalarLdFrag addr:$src2))))]>, - EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + []>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmbk : AVX512BI<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2), !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), - [(set _.KRC:$dst, (and _.KRCWM:$mask, - (OpNode_su (_.VT _.RC:$src1), - (X86VBroadcast - (_.ScalarLdFrag addr:$src2)))))]>, - EVEX_4V, EVEX_K, EVEX_B, + []>, EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + } } -multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, PatFrag OpNode, - PatFrag OpNode_su, X86SchedWriteWidths sched, +multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.ZMM, + defm Z : avx512_icmp_packed<opc, OpcodeStr, sched.ZMM, VTInfo.info512, IsCommutable>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.YMM, + defm Z256 : avx512_icmp_packed<opc, OpcodeStr, sched.YMM, VTInfo.info256, IsCommutable>, EVEX_V256; - defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.XMM, + defm Z128 : avx512_icmp_packed<opc, OpcodeStr, sched.XMM, VTInfo.info128, IsCommutable>, EVEX_V128; } } multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr, - PatFrag OpNode, PatFrag OpNode_su, X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, 
Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.ZMM, + defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, sched.ZMM, VTInfo.info512, IsCommutable>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.YMM, + defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, sched.YMM, VTInfo.info256, IsCommutable>, EVEX_V256; - defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.XMM, + defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, sched.XMM, VTInfo.info128, IsCommutable>, EVEX_V128; } } @@ -2195,53 +2228,42 @@ multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr, // This fragment treats X86cmpm as commutable to help match loads in both // operands for PCMPEQ. def X86setcc_commute : SDNode<"ISD::SETCC", SDTSetCC, [SDNPCommutative]>; -def X86pcmpeqm_c : PatFrag<(ops node:$src1, node:$src2), - (X86setcc_commute node:$src1, node:$src2, SETEQ)>; def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2), (setcc node:$src1, node:$src2, SETGT)>; -def X86pcmpeqm_c_su : PatFrag<(ops node:$src1, node:$src2), - (X86pcmpeqm_c node:$src1, node:$src2), [{ - return N->hasOneUse(); -}]>; -def X86pcmpgtm_su : PatFrag<(ops node:$src1, node:$src2), - (X86pcmpgtm node:$src1, node:$src2), [{ - return N->hasOneUse(); -}]>; - // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't // increase the pattern complexity the way an immediate would. let AddedComplexity = 2 in { // FIXME: Is there a better scheduler class for VPCMP? -defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", SchedWriteVecALU, avx512vl_i8_info, HasBWI, 1>, EVEX_CD8<8, CD8VF>, VEX_WIG; -defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", SchedWriteVecALU, avx512vl_i16_info, HasBWI, 1>, EVEX_CD8<16, CD8VF>, VEX_WIG; -defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", SchedWriteVecALU, avx512vl_i32_info, HasAVX512, 1>, EVEX_CD8<32, CD8VF>; -defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", SchedWriteVecALU, avx512vl_i64_info, HasAVX512, 1>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", SchedWriteVecALU, avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>, VEX_WIG; -defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", SchedWriteVecALU, avx512vl_i16_info, HasBWI>, EVEX_CD8<16, CD8VF>, VEX_WIG; -defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", SchedWriteVecALU, avx512vl_i32_info, HasAVX512>, EVEX_CD8<32, CD8VF>; -defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", SchedWriteVecALU, avx512vl_i64_info, HasAVX512>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; } @@ -2322,8 +2344,7 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag, "$dst, $src1, ${src2}", 
_.BroadcastStr, ", $cc}"), [(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1), - (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), + (_.BroadcastLdFrag addr:$src2), cond)))]>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmibk : AVX512AIi8<opc, MRMSrcMem, @@ -2335,23 +2356,21 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag, [(set _.KRC:$dst, (and _.KRCWM:$mask, (_.KVT (Frag_su:$cc (_.VT _.RC:$src1), - (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), + (_.BroadcastLdFrag addr:$src2), cond))))]>, EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; - def : Pat<(_.KVT (CommFrag:$cc (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + def : Pat<(_.KVT (CommFrag:$cc (_.BroadcastLdFrag addr:$src2), (_.VT _.RC:$src1), cond)), (!cast<Instruction>(Name#_.ZSuffix#"rmib") _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>; def : Pat<(and _.KRCWM:$mask, - (_.KVT (CommFrag_su:$cc (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), + (_.KVT (CommFrag_su:$cc (_.BroadcastLdFrag addr:$src2), (_.VT _.RC:$src1), cond))), (!cast<Instruction>(Name#_.ZSuffix#"rmibk") _.KRCWM:$mask, _.RC:$src1, addr:$src2, - (CommFrag.OperandTransform $cc))>; + (CommFrag_su.OperandTransform $cc))>; } multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, PatFrag Frag, @@ -2496,14 +2515,19 @@ def X86cmpmSAE_su : PatFrag<(ops node:$src1, node:$src2, node:$cc), return N->hasOneUse(); }]>; +def X86cmpm_imm_commute : SDNodeXForm<timm, [{ + uint8_t Imm = X86::getSwappedVCMPImm(N->getZExtValue() & 0x1f); + return getI8Imm(Imm, SDLoc(N)); +}]>; + multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _, string Name> { defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,u8imm:$cc), "vcmp"#_.Suffix, "$cc, $src2, $src1", "$src1, $src2, $cc", - (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc), - (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc), + (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), + (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), 1>, Sched<[sched]>; defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, @@ -2511,9 +2535,9 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _, "vcmp"#_.Suffix, "$cc, $src2, $src1", "$src1, $src2, $cc", (X86cmpm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), - imm:$cc), + timm:$cc), (X86cmpm_su (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), - imm:$cc)>, + timm:$cc)>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, @@ -2523,38 +2547,37 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _, "$cc, ${src2}"#_.BroadcastStr#", $src1", "$src1, ${src2}"#_.BroadcastStr#", $cc", (X86cmpm (_.VT _.RC:$src1), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - imm:$cc), + (_.VT (_.BroadcastLdFrag addr:$src2)), + timm:$cc), (X86cmpm_su (_.VT _.RC:$src1), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - imm:$cc)>, + (_.VT (_.BroadcastLdFrag addr:$src2)), + timm:$cc)>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; // Patterns for selecting with loads in other operand. 
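The X86cmpm_imm_commute transform used in the patterns that follow relies on the invariant that swapping a compare's operands while replacing the predicate immediate with its swapped counterpart (via X86::getSwappedVCMPImm) preserves the result; that is what lets a load in the first operand be commuted into the foldable second slot. A simplified illustration of the invariant over a toy predicate set — this is not the real 5-bit VCMP encoding table:

#include <cassert>
#include <initializer_list>

// Toy predicate set; real VCMP immediates use a 5-bit encoding handled by
// X86::getSwappedVCMPImm.
enum Pred { EQ, LT, LE, GT, GE, NE };

Pred swapped(Pred P) {
  switch (P) {
  case LT: return GT;
  case LE: return GE;
  case GT: return LT;
  case GE: return LE;
  default: return P; // EQ and NE are symmetric in their operands.
  }
}

bool cmp(double A, double B, Pred P) {
  switch (P) {
  case EQ: return A == B;
  case LT: return A < B;
  case LE: return A <= B;
  case GT: return A > B;
  case GE: return A >= B;
  case NE: return A != B;
  }
  return false;
}

int main() {
  // cmp(a, b, pred) must equal cmp(b, a, swapped(pred)) for every pair.
  for (Pred P : {EQ, LT, LE, GT, GE, NE})
    for (double A : {1.0, 2.0})
      for (double B : {1.0, 2.0})
        assert(cmp(A, B, P) == cmp(B, A, swapped(P)));
  return 0;
}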
def : Pat<(X86cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1), - CommutableCMPCC:$cc), + timm:$cc), (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, - imm:$cc)>; + (X86cmpm_imm_commute timm:$cc))>; def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (_.LdFrag addr:$src2), (_.VT _.RC:$src1), - CommutableCMPCC:$cc)), + timm:$cc)), (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask, _.RC:$src1, addr:$src2, - imm:$cc)>; + (X86cmpm_imm_commute timm:$cc))>; - def : Pat<(X86cmpm (X86VBroadcast (_.ScalarLdFrag addr:$src2)), - (_.VT _.RC:$src1), CommutableCMPCC:$cc), + def : Pat<(X86cmpm (_.BroadcastLdFrag addr:$src2), + (_.VT _.RC:$src1), timm:$cc), (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, - imm:$cc)>; + (X86cmpm_imm_commute timm:$cc))>; - def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), + def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (_.BroadcastLdFrag addr:$src2), (_.VT _.RC:$src1), - CommutableCMPCC:$cc)), + timm:$cc)), (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask, _.RC:$src1, addr:$src2, - imm:$cc)>; + (X86cmpm_imm_commute timm:$cc))>; } multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> { @@ -2564,9 +2587,9 @@ multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> { "vcmp"#_.Suffix, "$cc, {sae}, $src2, $src1", "$src1, $src2, {sae}, $cc", - (X86cmpmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc), + (X86cmpmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), (X86cmpmSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), - imm:$cc)>, + timm:$cc)>, EVEX_B, Sched<[sched]>; } @@ -2590,12 +2613,12 @@ defm VCMPPS : avx512_vcmp<SchedWriteFCmp, avx512vl_f32_info>, // Patterns to select fp compares with load as first operand. let Predicates = [HasAVX512] in { def : Pat<(v1i1 (X86cmpms (loadf64 addr:$src2), FR64X:$src1, - CommutableCMPCC:$cc)), - (VCMPSDZrm FR64X:$src1, addr:$src2, imm:$cc)>; + timm:$cc)), + (VCMPSDZrm FR64X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>; def : Pat<(v1i1 (X86cmpms (loadf32 addr:$src2), FR32X:$src1, - CommutableCMPCC:$cc)), - (VCMPSSZrm FR32X:$src1, addr:$src2, imm:$cc)>; + timm:$cc)), + (VCMPSSZrm FR32X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>; } // ---------------------------------------------------------------- @@ -2621,7 +2644,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(X86Vfpclasss (_.VT _.RC:$src1), - (i32 imm:$src2)))]>, + (i32 timm:$src2)))]>, Sched<[sched]>; def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), @@ -2629,7 +2652,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclasss_su (_.VT _.RC:$src1), - (i32 imm:$src2))))]>, + (i32 timm:$src2))))]>, EVEX_K, Sched<[sched]>; def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.IntScalarMemOp:$src1, i32u8imm:$src2), @@ -2637,7 +2660,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst, (X86Vfpclasss _.ScalarIntMemCPat:$src1, - (i32 imm:$src2)))]>, + (i32 timm:$src2)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.IntScalarMemOp:$src1, i32u8imm:$src2), @@ -2645,7 +2668,7 @@ multiclass 
avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclasss_su _.ScalarIntMemCPat:$src1, - (i32 imm:$src2))))]>, + (i32 timm:$src2))))]>, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -2661,7 +2684,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(X86Vfpclass (_.VT _.RC:$src1), - (i32 imm:$src2)))]>, + (i32 timm:$src2)))]>, Sched<[sched]>; def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), @@ -2669,7 +2692,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclass_su (_.VT _.RC:$src1), - (i32 imm:$src2))))]>, + (i32 timm:$src2))))]>, EVEX_K, Sched<[sched]>; def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.MemOp:$src1, i32u8imm:$src2), @@ -2677,7 +2700,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(X86Vfpclass (_.VT (_.LdFrag addr:$src1)), - (i32 imm:$src2)))]>, + (i32 timm:$src2)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), @@ -2685,7 +2708,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst, (and _.KRCWM:$mask, (X86Vfpclass_su (_.VT (_.LdFrag addr:$src1)), - (i32 imm:$src2))))]>, + (i32 timm:$src2))))]>, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.ScalarMemOp:$src1, i32u8imm:$src2), @@ -2693,9 +2716,8 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, _.BroadcastStr##", $dst|$dst, ${src1}" ##_.BroadcastStr##", $src2}", [(set _.KRC:$dst,(X86Vfpclass - (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src1))), - (i32 imm:$src2)))]>, + (_.VT (_.BroadcastLdFrag addr:$src1)), + (i32 timm:$src2)))]>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), @@ -2703,9 +2725,8 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, _.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"## _.BroadcastStr##", $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclass_su - (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src1))), - (i32 imm:$src2))))]>, + (_.VT (_.BroadcastLdFrag addr:$src1)), + (i32 timm:$src2))))]>, EVEX_B, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -2836,13 +2857,21 @@ def : Pat<(i8 (bitconvert (v8i1 VK8:$src))), def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))), (KMOVWrk VK16:$src)>; +def : Pat<(i64 (zext (i16 (bitconvert (v16i1 VK16:$src))))), + (SUBREG_TO_REG (i64 0), (KMOVWrk VK16:$src), sub_32bit)>; def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))), (COPY_TO_REGCLASS VK16:$src, GR32)>; +def : Pat<(i64 (anyext (i16 (bitconvert (v16i1 VK16:$src))))), + (INSERT_SUBREG (IMPLICIT_DEF), (COPY_TO_REGCLASS VK16:$src, GR32), sub_32bit)>; def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))), (KMOVBrk VK8:$src)>, Requires<[HasDQI]>; +def : Pat<(i64 (zext (i8 (bitconvert (v8i1 VK8:$src))))), + (SUBREG_TO_REG (i64 0), (KMOVBrk VK8:$src), 
sub_32bit)>, Requires<[HasDQI]>; def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))), (COPY_TO_REGCLASS VK8:$src, GR32)>; +def : Pat<(i64 (anyext (i8 (bitconvert (v8i1 VK8:$src))))), + (INSERT_SUBREG (IMPLICIT_DEF), (COPY_TO_REGCLASS VK8:$src, GR32), sub_32bit)>; def : Pat<(v32i1 (bitconvert (i32 GR32:$src))), (COPY_TO_REGCLASS GR32:$src, VK32)>; @@ -3075,7 +3104,7 @@ multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC, def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, u8imm:$imm), !strconcat(OpcodeStr, "\t{$imm, $src, $dst|$dst, $src, $imm}"), - [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>, + [(set KRC:$dst, (OpNode KRC:$src, (i8 timm:$imm)))]>, Sched<[sched]>; } @@ -3098,30 +3127,6 @@ defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, WriteShu defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, WriteShuffle>; // Patterns for comparing 128/256-bit integer vectors using 512-bit instruction. -multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su, - string InstStr, - X86VectorVTInfo Narrow, - X86VectorVTInfo Wide> { - def : Pat<(Narrow.KVT (Frag (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2))), - (COPY_TO_REGCLASS - (!cast<Instruction>(InstStr#"Zrr") - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))), - Narrow.KRC)>; - - def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, - (Frag_su (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2)))), - (COPY_TO_REGCLASS - (!cast<Instruction>(InstStr#"Zrrk") - (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))), - Narrow.KRC)>; -} - -// Patterns for comparing 128/256-bit integer vectors using 512-bit instruction. multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su, string InstStr, X86VectorVTInfo Narrow, @@ -3129,7 +3134,7 @@ multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su, def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1), (Narrow.VT Narrow.RC:$src2), cond)), (COPY_TO_REGCLASS - (!cast<Instruction>(InstStr##Zrri) + (!cast<Instruction>(InstStr#"Zrri") (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)), (Frag.OperandTransform $cc)), Narrow.KRC)>; @@ -3138,53 +3143,111 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, (Narrow.KVT (Frag_su:$cc (Narrow.VT Narrow.RC:$src1), (Narrow.VT Narrow.RC:$src2), cond)))), - (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik) + (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrrik") (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)), - (Frag.OperandTransform $cc)), Narrow.KRC)>; + (Frag_su.OperandTransform $cc)), Narrow.KRC)>; +} + +multiclass axv512_icmp_packed_cc_rmb_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su, + PatFrag CommFrag, PatFrag CommFrag_su, + string InstStr, + X86VectorVTInfo Narrow, + X86VectorVTInfo Wide> { +// Broadcast load. 
+def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1), + (Narrow.BroadcastLdFrag addr:$src2), cond)), + (COPY_TO_REGCLASS + (!cast<Instruction>(InstStr#"Zrmib") + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, (Frag.OperandTransform $cc)), Narrow.KRC)>; + +def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, + (Narrow.KVT + (Frag_su:$cc (Narrow.VT Narrow.RC:$src1), + (Narrow.BroadcastLdFrag addr:$src2), + cond)))), + (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmibk") + (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, (Frag_su.OperandTransform $cc)), Narrow.KRC)>; + +// Commuted with broadcast load. +def : Pat<(Narrow.KVT (CommFrag:$cc (Narrow.BroadcastLdFrag addr:$src2), + (Narrow.VT Narrow.RC:$src1), + cond)), + (COPY_TO_REGCLASS + (!cast<Instruction>(InstStr#"Zrmib") + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, (CommFrag.OperandTransform $cc)), Narrow.KRC)>; + +def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, + (Narrow.KVT + (CommFrag_su:$cc (Narrow.BroadcastLdFrag addr:$src2), + (Narrow.VT Narrow.RC:$src1), + cond)))), + (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmibk") + (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, (CommFrag_su.OperandTransform $cc)), Narrow.KRC)>; } // Same as above, but for fp types which don't use PatFrags. -multiclass axv512_cmp_packed_cc_no_vlx_lowering<SDNode OpNode, PatFrag OpNode_su, - string InstStr, +multiclass axv512_cmp_packed_cc_no_vlx_lowering<string InstStr, X86VectorVTInfo Narrow, X86VectorVTInfo Wide> { -def : Pat<(Narrow.KVT (OpNode (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2), imm:$cc)), +def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2), timm:$cc)), (COPY_TO_REGCLASS - (!cast<Instruction>(InstStr##Zrri) + (!cast<Instruction>(InstStr#"Zrri") (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)), - imm:$cc), Narrow.KRC)>; + timm:$cc), Narrow.KRC)>; def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, - (OpNode_su (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2), imm:$cc))), - (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik) + (X86cmpm_su (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2), timm:$cc))), + (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrrik") (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)), - imm:$cc), Narrow.KRC)>; -} + timm:$cc), Narrow.KRC)>; -let Predicates = [HasAVX512, NoVLX] in { - // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't - // increase the pattern complexity the way an immediate would. - let AddedComplexity = 2 in { - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTD", v8i32x_info, v16i32_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQD", v8i32x_info, v16i32_info>; +// Broadcast load. 
+def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1), + (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc)), + (COPY_TO_REGCLASS + (!cast<Instruction>(InstStr#"Zrmbi") + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, timm:$cc), Narrow.KRC)>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTD", v4i32x_info, v16i32_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQD", v4i32x_info, v16i32_info>; +def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, + (X86cmpm_su (Narrow.VT Narrow.RC:$src1), + (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc))), + (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmbik") + (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, timm:$cc), Narrow.KRC)>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTQ", v4i64x_info, v8i64_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQQ", v4i64x_info, v8i64_info>; +// Commuted with broadcast load. +def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), + (Narrow.VT Narrow.RC:$src1), timm:$cc)), + (COPY_TO_REGCLASS + (!cast<Instruction>(InstStr#"Zrmbi") + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTQ", v2i64x_info, v8i64_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQQ", v2i64x_info, v8i64_info>; - } +def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, + (X86cmpm_su (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), + (Narrow.VT Narrow.RC:$src1), timm:$cc))), + (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmbik") + (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>; +} +let Predicates = [HasAVX512, NoVLX] in { defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v8i32x_info, v16i32_info>; defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v8i32x_info, v16i32_info>; @@ -3197,29 +3260,25 @@ let Predicates = [HasAVX512, NoVLX] in { defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPQ", v2i64x_info, v8i64_info>; defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUQ", v2i64x_info, v8i64_info>; - defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPS", v8f32x_info, v16f32_info>; - defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPS", v4f32x_info, v16f32_info>; - defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPD", v4f64x_info, v8f64_info>; - defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPD", v2f64x_info, v8f64_info>; -} + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPD", v8i32x_info, v16i32_info>; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUD", v8i32x_info, v16i32_info>; -let Predicates = [HasBWI, NoVLX] in { - // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't - // increase the pattern complexity the way an immediate would. 
- let AddedComplexity = 2 in { - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTB", v32i8x_info, v64i8_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQB", v32i8x_info, v64i8_info>; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPD", v4i32x_info, v16i32_info>; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUD", v4i32x_info, v16i32_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTB", v16i8x_info, v64i8_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQB", v16i8x_info, v64i8_info>; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPQ", v4i64x_info, v8i64_info>; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUQ", v4i64x_info, v8i64_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTW", v16i16x_info, v32i16_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQW", v16i16x_info, v32i16_info>; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPQ", v2i64x_info, v8i64_info>; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUQ", v2i64x_info, v8i64_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTW", v8i16x_info, v32i16_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQW", v8i16x_info, v32i16_info>; - } + defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPS", v8f32x_info, v16f32_info>; + defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPS", v4f32x_info, v16f32_info>; + defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v4f64x_info, v8f64_info>; + defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v2f64x_info, v8f64_info>; +} +let Predicates = [HasBWI, NoVLX] in { defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPB", v32i8x_info, v64i8_info>; defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUB", v32i8x_info, v64i8_info>; @@ -4186,16 +4245,32 @@ def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), fp32imm0)), (COPY_TO_REGCLASS (v4f32 (VMOVSSZrrkz VK1WM:$mask, (v4f32 (IMPLICIT_DEF)), (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)))), FR32X)>; +def : Pat<(f32 (X86selects VK1WM:$mask, (loadf32 addr:$src), (f32 FR32X:$src0))), + (COPY_TO_REGCLASS + (v4f32 (VMOVSSZrmk (v4f32 (COPY_TO_REGCLASS FR32X:$src0, VR128X)), + VK1WM:$mask, addr:$src)), + FR32X)>; +def : Pat<(f32 (X86selects VK1WM:$mask, (loadf32 addr:$src), fp32imm0)), + (COPY_TO_REGCLASS (v4f32 (VMOVSSZrmkz VK1WM:$mask, addr:$src)), FR32X)>; + def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), (COPY_TO_REGCLASS (v2f64 (VMOVSDZrrk (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)), VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>; -def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), fpimm0)), +def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), fp64imm0)), (COPY_TO_REGCLASS (v2f64 (VMOVSDZrrkz VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>; 
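The masked scalar move patterns added around this point (the VMOVSSZrmk/VMOVSSZrmkz forms above, and the VMOVSDZrmk/VMOVSDZrmkz forms just below) fold a masked select of a scalar load into a single merge- or zero-masked load. In scalar terms the semantics being matched are as follows (a sketch of the behavior, not the instruction definition; the mask is the low bit of a k-register):

#include <cstdint>

// Scalar semantics of a merge- or zero-masked scalar load: if the low mask
// bit is set, load the element; otherwise keep the passthru (merge form) or
// produce 0.0 (zeroing form).
double masked_scalar_load(std::uint8_t K, const double *P, double Passthru,
                          bool Zeroing) {
  if (K & 1)
    return *P;
  return Zeroing ? 0.0 : Passthru;
}

int main() {
  double Mem = 3.5;
  // Merge form keeps Passthru when the mask bit is clear.
  double A = masked_scalar_load(0, &Mem, 7.0, /*Zeroing=*/false); // 7.0
  // Zeroing form yields 0.0 when the mask bit is clear.
  double B = masked_scalar_load(0, &Mem, 7.0, /*Zeroing=*/true);  // 0.0
  return (A == 7.0 && B == 0.0) ? 0 : 1;
}

Since the DAG pattern being matched contains an unconditional load, folding it into a mask-predicated load only strengthens safety: the EVEX form skips the memory access entirely when the mask bit is clear.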
+def : Pat<(f64 (X86selects VK1WM:$mask, (loadf64 addr:$src), (f64 FR64X:$src0))), + (COPY_TO_REGCLASS + (v2f64 (VMOVSDZrmk (v2f64 (COPY_TO_REGCLASS FR64X:$src0, VR128X)), + VK1WM:$mask, addr:$src)), + FR64X)>; +def : Pat<(f64 (X86selects VK1WM:$mask, (loadf64 addr:$src), fp64imm0)), + (COPY_TO_REGCLASS (v2f64 (VMOVSDZrmkz VK1WM:$mask, addr:$src)), FR64X)>; + let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), @@ -4537,8 +4612,7 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src1, - (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))>, + (_.BroadcastLdFrag addr:$src2)))>, AVX512BIBase, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4664,8 +4738,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, "${src2}"##_Brdct.BroadcastStr##", $src1", "$src1, ${src2}"##_Brdct.BroadcastStr, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert - (_Brdct.VT (X86VBroadcast - (_Brdct.ScalarLdFrag addr:$src2))))))>, + (_Brdct.VT (_Brdct.BroadcastLdFrag addr:$src2)))))>, AVX512BIBase, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4737,8 +4810,7 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, "${src2}"##_Src.BroadcastStr##", $src1", "$src1, ${src2}"##_Src.BroadcastStr, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert - (_Src.VT (X86VBroadcast - (_Src.ScalarLdFrag addr:$src2))))))>, + (_Src.VT (_Src.BroadcastLdFrag addr:$src2)))))>, EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4874,22 +4946,11 @@ let Predicates = [HasDQI, NoVLX] in { (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), sub_ymm)>; - - def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), + def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 (X86VBroadcastld64 addr:$src2)))), (EXTRACT_SUBREG - (VPMULLQZrr - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), - sub_xmm)>; -} - -// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX. 
-let Predicates = [HasDQI, NoVLX] in { - def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), - (EXTRACT_SUBREG - (VPMULLQZrr + (VPMULLQZrmb (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), + addr:$src2), sub_ymm)>; def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), @@ -4898,29 +4959,47 @@ let Predicates = [HasDQI, NoVLX] in { (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), sub_xmm)>; + def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 (X86VBroadcastld64 addr:$src2)))), + (EXTRACT_SUBREG + (VPMULLQZrmb + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + addr:$src2), + sub_xmm)>; } -multiclass avx512_min_max_lowering<Instruction Instr, SDNode OpNode> { +multiclass avx512_min_max_lowering<string Instr, SDNode OpNode> { def : Pat<(v4i64 (OpNode VR256X:$src1, VR256X:$src2)), (EXTRACT_SUBREG - (Instr + (!cast<Instruction>(Instr#"rr") (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), sub_ymm)>; + def : Pat<(v4i64 (OpNode (v4i64 VR256X:$src1), (v4i64 (X86VBroadcastld64 addr:$src2)))), + (EXTRACT_SUBREG + (!cast<Instruction>(Instr#"rmb") + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), + addr:$src2), + sub_ymm)>; def : Pat<(v2i64 (OpNode VR128X:$src1, VR128X:$src2)), (EXTRACT_SUBREG - (Instr + (!cast<Instruction>(Instr#"rr") (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), sub_xmm)>; + def : Pat<(v2i64 (OpNode (v2i64 VR128X:$src1), (v2i64 (X86VBroadcastld64 addr:$src2)))), + (EXTRACT_SUBREG + (!cast<Instruction>(Instr#"rmb") + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + addr:$src2), + sub_xmm)>; } let Predicates = [HasAVX512, NoVLX] in { - defm : avx512_min_max_lowering<VPMAXUQZrr, umax>; - defm : avx512_min_max_lowering<VPMINUQZrr, umin>; - defm : avx512_min_max_lowering<VPMAXSQZrr, smax>; - defm : avx512_min_max_lowering<VPMINSQZrr, smin>; + defm : avx512_min_max_lowering<"VPMAXUQZ", umax>; + defm : avx512_min_max_lowering<"VPMINUQZ", umin>; + defm : avx512_min_max_lowering<"VPMAXSQZ", smax>; + defm : avx512_min_max_lowering<"VPMINSQZ", smin>; } //===----------------------------------------------------------------------===// @@ -4977,32 +5056,6 @@ let Predicates = [HasVLX] in { def : Pat<(X86andnp VR128X:$src1, (loadv8i16 addr:$src2)), (VPANDNQZ128rm VR128X:$src1, addr:$src2)>; - def : Pat<(and VR128X:$src1, - (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPANDDZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(or VR128X:$src1, - (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPORDZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(xor VR128X:$src1, - (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPXORDZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(X86andnp VR128X:$src1, - (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPANDNDZ128rmb VR128X:$src1, addr:$src2)>; - - def : Pat<(and VR128X:$src1, - (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPANDQZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(or VR128X:$src1, - (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPORQZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(xor VR128X:$src1, - (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPXORQZ128rmb VR128X:$src1, 
addr:$src2)>; - def : Pat<(X86andnp VR128X:$src1, - (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPANDNQZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(v32i8 (and VR256X:$src1, VR256X:$src2)), (VPANDQZ256rr VR256X:$src1, VR256X:$src2)>; def : Pat<(v16i16 (and VR256X:$src1, VR256X:$src2)), @@ -5042,32 +5095,6 @@ let Predicates = [HasVLX] in { (VPANDNQZ256rm VR256X:$src1, addr:$src2)>; def : Pat<(X86andnp VR256X:$src1, (loadv16i16 addr:$src2)), (VPANDNQZ256rm VR256X:$src1, addr:$src2)>; - - def : Pat<(and VR256X:$src1, - (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPANDDZ256rmb VR256X:$src1, addr:$src2)>; - def : Pat<(or VR256X:$src1, - (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPORDZ256rmb VR256X:$src1, addr:$src2)>; - def : Pat<(xor VR256X:$src1, - (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPXORDZ256rmb VR256X:$src1, addr:$src2)>; - def : Pat<(X86andnp VR256X:$src1, - (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPANDNDZ256rmb VR256X:$src1, addr:$src2)>; - - def : Pat<(and VR256X:$src1, - (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPANDQZ256rmb VR256X:$src1, addr:$src2)>; - def : Pat<(or VR256X:$src1, - (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPORQZ256rmb VR256X:$src1, addr:$src2)>; - def : Pat<(xor VR256X:$src1, - (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPXORQZ256rmb VR256X:$src1, addr:$src2)>; - def : Pat<(X86andnp VR256X:$src1, - (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPANDNQZ256rmb VR256X:$src1, addr:$src2)>; } let Predicates = [HasAVX512] in { @@ -5110,32 +5137,6 @@ let Predicates = [HasAVX512] in { (VPANDNQZrm VR512:$src1, addr:$src2)>; def : Pat<(X86andnp VR512:$src1, (loadv32i16 addr:$src2)), (VPANDNQZrm VR512:$src1, addr:$src2)>; - - def : Pat<(and VR512:$src1, - (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPANDDZrmb VR512:$src1, addr:$src2)>; - def : Pat<(or VR512:$src1, - (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPORDZrmb VR512:$src1, addr:$src2)>; - def : Pat<(xor VR512:$src1, - (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPXORDZrmb VR512:$src1, addr:$src2)>; - def : Pat<(X86andnp VR512:$src1, - (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPANDNDZrmb VR512:$src1, addr:$src2)>; - - def : Pat<(and VR512:$src1, - (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPANDQZrmb VR512:$src1, addr:$src2)>; - def : Pat<(or VR512:$src1, - (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPORQZrmb VR512:$src1, addr:$src2)>; - def : Pat<(xor VR512:$src1, - (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPXORQZrmb VR512:$src1, addr:$src2)>; - def : Pat<(X86andnp VR512:$src1, - (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPANDNQZrmb VR512:$src1, addr:$src2)>; } // Patterns to catch vselect with different type than logic op. @@ -5174,25 +5175,17 @@ multiclass avx512_logical_lowering_bcast<string InstrStr, SDNode OpNode, X86VectorVTInfo _, X86VectorVTInfo IntInfo> { // Register-broadcast logical operations. 
- def : Pat<(IntInfo.VT (OpNode _.RC:$src1, - (bitconvert (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2)))))), - (!cast<Instruction>(InstrStr#rmb) _.RC:$src1, addr:$src2)>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (bitconvert (IntInfo.VT (OpNode _.RC:$src1, - (bitconvert (_.VT - (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))))), + (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))), _.RC:$src0)), (!cast<Instruction>(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, addr:$src2)>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (bitconvert (IntInfo.VT (OpNode _.RC:$src1, - (bitconvert (_.VT - (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))))), + (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))), _.ImmAllZerosV)), (!cast<Instruction>(InstrStr#rmbkz) _.KRCWM:$mask, _.RC:$src1, addr:$src2)>; @@ -5329,7 +5322,8 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo } multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, SDNode OpNode, SDNode VecNode, SDNode SaeNode, - X86FoldableSchedWrite sched, bit IsCommutable> { + X86FoldableSchedWrite sched, bit IsCommutable, + string EVEX2VexOvrd> { let ExeDomain = _.ExeDomain in { defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, @@ -5349,7 +5343,8 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, (ins _.FRC:$src1, _.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>, - Sched<[sched]> { + Sched<[sched]>, + EVEX2VEXOverride<EVEX2VexOvrd#"rr"> { let isCommutable = IsCommutable; } def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), @@ -5357,7 +5352,8 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2)))]>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, + EVEX2VEXOverride<EVEX2VexOvrd#"rm">; } defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), @@ -5387,10 +5383,12 @@ multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode VecNode, SDNode SaeNode, X86SchedWriteSizes sched, bit IsCommutable> { defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode, - VecNode, SaeNode, sched.PS.Scl, IsCommutable>, + VecNode, SaeNode, sched.PS.Scl, IsCommutable, + NAME#"SS">, XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode, - VecNode, SaeNode, sched.PD.Scl, IsCommutable>, + VecNode, SaeNode, sched.PD.Scl, IsCommutable, + NAME#"SD">, XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; } defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86fadds, X86faddRnds, @@ -5410,13 +5408,14 @@ defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxSAEs, // X86fminc and X86fmaxc instead of X86fmin and X86fmax multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, SDNode OpNode, - X86FoldableSchedWrite sched> { + X86FoldableSchedWrite sched, + string EVEX2VEXOvrd> { let isCodeGenOnly = 1, Predicates = [HasAVX512], ExeDomain = _.ExeDomain in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>, - Sched<[sched]> { + Sched<[sched]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rr"> 
{ let isCommutable = 1; } def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), @@ -5424,24 +5423,27 @@ multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr, OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2)))]>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, + EVEX2VEXOverride<EVEX2VEXOvrd#"rm">; } } defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc, - SchedWriteFCmp.Scl>, XS, EVEX_4V, - VEX_LIG, EVEX_CD8<32, CD8VT1>; + SchedWriteFCmp.Scl, "VMINCSS">, XS, + EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc, - SchedWriteFCmp.Scl>, XD, VEX_W, EVEX_4V, - VEX_LIG, EVEX_CD8<64, CD8VT1>; + SchedWriteFCmp.Scl, "VMINCSD">, XD, + VEX_W, EVEX_4V, VEX_LIG, + EVEX_CD8<64, CD8VT1>; defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc, - SchedWriteFCmp.Scl>, XS, EVEX_4V, - VEX_LIG, EVEX_CD8<32, CD8VT1>; + SchedWriteFCmp.Scl, "VMAXCSS">, XS, + EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc, - SchedWriteFCmp.Scl>, XD, VEX_W, EVEX_4V, - VEX_LIG, EVEX_CD8<64, CD8VT1>; + SchedWriteFCmp.Scl, "VMAXCSD">, XD, + VEX_W, EVEX_4V, VEX_LIG, + EVEX_CD8<64, CD8VT1>; multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, X86VectorVTInfo _, X86FoldableSchedWrite sched, @@ -5464,8 +5466,7 @@ multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpN (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, - (OpNode _.RC:$src1, (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))>, + (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -5595,8 +5596,7 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, - (OpNode _.RC:$src1, (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))>, + (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -5751,13 +5751,13 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM, defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (_.VT (OpNode _.RC:$src1, (i8 imm:$src2)))>, + (_.VT (OpNode _.RC:$src1, (i8 timm:$src2)))>, Sched<[sched]>; defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode (_.VT (_.LdFrag addr:$src1)), - (i8 imm:$src2)))>, + (i8 timm:$src2)))>, Sched<[sched.Folded]>; } } @@ -5769,7 +5769,7 @@ multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM, defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr, "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2", - (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2)))>, + (_.VT (OpNode (_.BroadcastLdFrag addr:$src1), (i8 timm:$src2)))>, EVEX_B, Sched<[sched.Folded]>; } @@ -5911,17 +5911,17 @@ let Predicates = [HasAVX512, NoVLX] in { (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), 
VR128X:$src2)), sub_xmm)>; - def : Pat<(v4i64 (X86vsrai (v4i64 VR256X:$src1), (i8 imm:$src2))), + def : Pat<(v4i64 (X86vsrai (v4i64 VR256X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v8i64 (VPSRAQZri (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - imm:$src2)), sub_ymm)>; + timm:$src2)), sub_ymm)>; - def : Pat<(v2i64 (X86vsrai (v2i64 VR128X:$src1), (i8 imm:$src2))), + def : Pat<(v2i64 (X86vsrai (v2i64 VR128X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v8i64 (VPSRAQZri (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), - imm:$src2)), sub_xmm)>; + timm:$src2)), sub_xmm)>; } //===-------------------------------------------------------------------===// @@ -5953,8 +5953,7 @@ multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, - (_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2)))))>, + (_.VT (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))>, AVX5128IBase, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6062,27 +6061,27 @@ let Predicates = [HasAVX512, NoVLX] in { (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))), sub_ymm)>; - def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 imm:$src2))), + def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v8i64 (VPROLQZri (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), - imm:$src2)), sub_xmm)>; - def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 imm:$src2))), + timm:$src2)), sub_xmm)>; + def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v8i64 (VPROLQZri (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - imm:$src2)), sub_ymm)>; + timm:$src2)), sub_ymm)>; - def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 imm:$src2))), + def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v16i32 (VPROLDZri (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), - imm:$src2)), sub_xmm)>; - def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 imm:$src2))), + timm:$src2)), sub_xmm)>; + def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v16i32 (VPROLDZri (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - imm:$src2)), sub_ymm)>; + timm:$src2)), sub_ymm)>; } // Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. 
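Both this rotate-left hunk and the rotate-right hunk that follows rely on the same NoVLX widening trick: insert the 128/256-bit value into a 512-bit register (upper lanes undef), run the EVEX-512 instruction, and extract the low subvector again. The imm-to-timm churn is orthogonal; it re-tags the rotate amount as a TargetConstant. Below is a minimal C++ model of why the widening is sound, assuming only that VPROLQ rotates each 64-bit lane independently (rol64 and prolq128_via_zmm are illustrative names, not code from this commit):

#include <array>
#include <cstdint>

// Rotate one 64-bit lane left by n (n taken modulo 64).
static uint64_t rol64(uint64_t v, unsigned n) {
  n &= 63;
  return n == 0 ? v : (v << n) | (v >> (64 - n));
}

// v2i64 VPROLQ emulated with the 512-bit instruction, as the patterns do:
// INSERT_SUBREG into an undef zmm, rotate every lane, EXTRACT_SUBREG sub_xmm.
std::array<uint64_t, 2> prolq128_via_zmm(std::array<uint64_t, 2> x, unsigned imm) {
  std::array<uint64_t, 8> zmm{};          // widened register; upper lanes don't matter
  zmm[0] = x[0];
  zmm[1] = x[1];
  for (auto &lane : zmm)
    lane = rol64(lane, imm);              // VPROLQZri acts lane-wise
  return {zmm[0], zmm[1]};                // only the low xmm lanes are kept
}

Because the operation never mixes lanes, whatever the widened upper lanes hold cannot leak into the extracted result.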
@@ -6113,27 +6112,27 @@ let Predicates = [HasAVX512, NoVLX] in { (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))), sub_ymm)>; - def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 imm:$src2))), + def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v8i64 (VPRORQZri (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), - imm:$src2)), sub_xmm)>; - def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 imm:$src2))), + timm:$src2)), sub_xmm)>; + def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v8i64 (VPRORQZri (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - imm:$src2)), sub_ymm)>; + timm:$src2)), sub_ymm)>; - def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 imm:$src2))), + def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v16i32 (VPRORDZri (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), - imm:$src2)), sub_xmm)>; - def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 imm:$src2))), + timm:$src2)), sub_xmm)>; + def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v16i32 (VPRORDZri (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - imm:$src2)), sub_ymm)>; + timm:$src2)), sub_ymm)>; } //===-------------------------------------------------------------------===// @@ -6228,8 +6227,7 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode, "$src1, ${src2}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src1, - (Ctrl.VT (X86VBroadcast - (Ctrl.ScalarLdFrag addr:$src2)))))>, + (Ctrl.VT (Ctrl.BroadcastLdFrag addr:$src2))))>, T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6419,7 +6417,7 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (OpNode _.RC:$src2, - _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>, + _.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))), 1, 0>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -6493,7 +6491,7 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", "$src2, ${src3}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src2, - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6571,7 +6569,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", "$src2, ${src3}"##_.BroadcastStr, - (_.VT (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + (_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1, _.RC:$src2)), 1, 0>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6964,7 +6962,7 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (OpNode _.RC:$src2, - (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))), + (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1)>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -7504,14 +7502,13 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, OpcodeStr, 
"${src}"##Broadcast, "${src}"##Broadcast, (_.VT (OpNode (_Src.VT - (X86VBroadcast (_Src.ScalarLdFrag addr:$src))) + (_Src.BroadcastLdFrag addr:$src)) )), (vselect MaskRC:$mask, (_.VT (OpNode (_Src.VT - (X86VBroadcast - (_Src.ScalarLdFrag addr:$src))))), + (_Src.BroadcastLdFrag addr:$src)))), _.RC:$src0), vselect, "$src0 = $dst">, EVEX, EVEX_B, Sched<[sched.Folded]>; @@ -7646,14 +7643,14 @@ let Predicates = [HasAVX512] in { v8f32x_info.ImmAllZerosV), (VCVTPD2PSZrmkz VK8WM:$mask, addr:$src)>; - def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcastld64 addr:$src)))), (VCVTPD2PSZrmb addr:$src)>; def : Pat<(vselect VK8WM:$mask, - (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))), + (fpround (v8f64 (X86VBroadcastld64 addr:$src))), (v8f32 VR256X:$src0)), (VCVTPD2PSZrmbk VR256X:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, - (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))), + (fpround (v8f64 (X86VBroadcastld64 addr:$src))), v8f32x_info.ImmAllZerosV), (VCVTPD2PSZrmbkz VK8WM:$mask, addr:$src)>; } @@ -7677,14 +7674,14 @@ let Predicates = [HasVLX] in { v4f32x_info.ImmAllZerosV), (VCVTPD2PSZ256rmkz VK4WM:$mask, addr:$src)>; - def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))), (VCVTPD2PSZ256rmb addr:$src)>; def : Pat<(vselect VK4WM:$mask, - (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + (v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))), VR128X:$src0), (VCVTPD2PSZ256rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; def : Pat<(vselect VK4WM:$mask, - (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + (v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))), v4f32x_info.ImmAllZerosV), (VCVTPD2PSZ256rmbkz VK4WM:$mask, addr:$src)>; @@ -7708,12 +7705,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTPD2PSZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(X86vfpround (v2f64 (X86VBroadcast (loadf64 addr:$src)))), + def : Pat<(X86vfpround (v2f64 (X86VBroadcastld64 addr:$src))), (VCVTPD2PSZ128rmb addr:$src)>; - def : Pat<(X86vmfpround (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)), (v4f32 VR128X:$src0), VK2WM:$mask), (VCVTPD2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86vmfpround (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)), v4f32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTPD2PSZ128rmbkz VK2WM:$mask, addr:$src)>; } @@ -8194,12 +8191,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTPD2DQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4i32 (X86cvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTPD2DQZ128rmb addr:$src)>; - def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTPD2DQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTPD2DQZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8223,12 +8220,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTTPD2DQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))))), + def : 
Pat<(v4i32 (X86cvttp2si (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTTPD2DQZ128rmb addr:$src)>; - def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTTPD2DQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTTPD2DQZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8252,12 +8249,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTPD2UDQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4i32 (X86cvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTPD2UDQZ128rmb addr:$src)>; - def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTPD2UDQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8281,12 +8278,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTTPD2UDQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4i32 (X86cvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTTPD2UDQZ128rmb addr:$src)>; - def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTTPD2UDQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>; } @@ -8419,12 +8416,12 @@ let Predicates = [HasDQI, HasVLX] in { VK2WM:$mask), (VCVTQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))), + def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcastld64 addr:$src)))), (VCVTQQ2PSZ128rmb addr:$src)>; - def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)), (v4f32 VR128X:$src0), VK2WM:$mask), (VCVTQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)), v4f32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8448,12 +8445,12 @@ let Predicates = [HasDQI, HasVLX] in { VK2WM:$mask), (VCVTUQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))), + def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcastld64 addr:$src)))), (VCVTUQQ2PSZ128rmb addr:$src)>; - def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)), (v4f32 VR128X:$src0), VK2WM:$mask), (VCVTUQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)), v4f32x_info.ImmAllZerosV, 
VK2WM:$mask), (VCVTUQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>; } @@ -8576,21 +8573,21 @@ let ExeDomain = GenericDomain in { (ins _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _dest.RC:$dst, - (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2)))]>, + (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2)))]>, Sched<[RR]>; let Constraints = "$src0 = $dst" in def rrk : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst), (ins _dest.RC:$src0, _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _dest.RC:$dst, - (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2), + (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2), _dest.RC:$src0, _src.KRCWM:$mask))]>, Sched<[RR]>, EVEX_K; def rrkz : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst), (ins _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}", [(set _dest.RC:$dst, - (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2), + (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2), _dest.ImmAllZerosV, _src.KRCWM:$mask))]>, Sched<[RR]>, EVEX_KZ; let hasSideEffects = 0, mayStore = 1 in { @@ -8631,17 +8628,17 @@ let Predicates = [HasAVX512] in { } def : Pat<(store (f64 (extractelt - (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))), + (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, timm:$src2))), (iPTR 0))), addr:$dst), - (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>; + (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>; def : Pat<(store (i64 (extractelt - (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))), + (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, timm:$src2))), (iPTR 0))), addr:$dst), - (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>; - def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, i32:$src2)), addr:$dst), - (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, imm:$src2)>; - def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, i32:$src2)), addr:$dst), - (VCVTPS2PHZmr addr:$dst, VR512:$src1, imm:$src2)>; + (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>; + def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, timm:$src2)), addr:$dst), + (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, timm:$src2)>; + def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, timm:$src2)), addr:$dst), + (VCVTPS2PHZmr addr:$dst, VR512:$src1, timm:$src2)>; } // Patterns for matching conversions from float to half-float and vice versa. 
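The recurring change through these conversion hunks replaces the two-node X86VBroadcast (loadf64 addr) pattern with the single X86VBroadcastld64 node, so the broadcast-from-memory forms (the "rmb" suffix) can fold the scalar load directly. A rough C++ model of what the 128-bit VCVTPD2PS...rmb form computes, assuming the EVEX encoding zeroes the upper result lanes (vcvtpd2ps_rmb_xmm is an illustrative name, not LLVM code):

#include <array>

// One f64 is loaded and splat to both input lanes; converting v2f64 to
// v4f32 fills the low two lanes and zeroes the rest of the xmm result.
std::array<float, 4> vcvtpd2ps_rmb_xmm(const double *mem) {
  double splat = *mem;          // X86VBroadcastld64: load folded into the op
  std::array<float, 4> out{};   // upper two lanes left at zero
  out[0] = static_cast<float>(splat);
  out[1] = static_cast<float>(splat);
  return out;
}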
@@ -8765,7 +8762,7 @@ multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, (OpNode (_.VT - (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + (_.BroadcastLdFrag addr:$src)))>, EVEX, T8PD, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -8859,7 +8856,7 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, (OpNode (_.VT - (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + (_.BroadcastLdFrag addr:$src)))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -8940,7 +8937,7 @@ multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, (fsqrt (_.VT - (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + (_.BroadcastLdFrag addr:$src)))>, EVEX, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -9049,14 +9046,14 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3)))>, + (i32 timm:$src3)))>, Sched<[sched]>; defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3", (_.VT (X86RndScalesSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3)))>, EVEX_B, + (i32 timm:$src3)))>, EVEX_B, Sched<[sched]>; defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), @@ -9064,7 +9061,7 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (X86RndScales _.RC:$src1, - _.ScalarIntMemCPat:$src2, (i32 imm:$src3)))>, + _.ScalarIntMemCPat:$src2, (i32 timm:$src3)))>, Sched<[sched.Folded, sched.ReadAfterFold]>; let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [HasAVX512] in { @@ -9082,15 +9079,15 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, } let Predicates = [HasAVX512] in { - def : Pat<(X86VRndScale _.FRC:$src1, imm:$src2), + def : Pat<(X86VRndScale _.FRC:$src1, timm:$src2), (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)), - _.FRC:$src1, imm:$src2))>; + _.FRC:$src1, timm:$src2))>; } let Predicates = [HasAVX512, OptForSize] in { - def : Pat<(X86VRndScale (_.ScalarLdFrag addr:$src1), imm:$src2), + def : Pat<(X86VRndScale (_.ScalarLdFrag addr:$src1), timm:$src2), (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)), - addr:$src1, imm:$src2))>; + addr:$src1, timm:$src2))>; } } @@ -10109,19 +10106,19 @@ multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNo (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), - (i32 imm:$src2))>, Sched<[sched]>; + (i32 timm:$src2))>, Sched<[sched]>; defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), - (i32 imm:$src2))>, + (i32 timm:$src2))>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr, 
"${src1}"##_.BroadcastStr##", $src2", - (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))), - (i32 imm:$src2))>, EVEX_B, + (OpNode (_.VT (_.BroadcastLdFrag addr:$src1)), + (i32 timm:$src2))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10136,7 +10133,7 @@ multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, OpcodeStr##_.Suffix, "$src2, {sae}, $src1", "$src1, {sae}, $src2", (OpNode (_.VT _.RC:$src1), - (i32 imm:$src2))>, + (i32 timm:$src2))>, EVEX_B, Sched<[sched]>; } @@ -10169,22 +10166,22 @@ multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3))>, + (i32 timm:$src3))>, Sched<[sched]>; defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), - (i32 imm:$src3))>, + (i32 timm:$src3))>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr##", $src3", (OpNode (_.VT _.RC:$src1), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - (i32 imm:$src3))>, EVEX_B, + (_.VT (_.BroadcastLdFrag addr:$src2)), + (i32 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10200,7 +10197,7 @@ multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), (SrcInfo.VT SrcInfo.RC:$src2), - (i8 imm:$src3)))>, + (i8 timm:$src3)))>, Sched<[sched]>; defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), (ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3), @@ -10208,7 +10205,7 @@ multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), (SrcInfo.VT (bitconvert (SrcInfo.LdFrag addr:$src2))), - (i8 imm:$src3)))>, + (i8 timm:$src3)))>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10226,8 +10223,8 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr##", $src3", (OpNode (_.VT _.RC:$src1), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - (i8 imm:$src3))>, EVEX_B, + (_.VT (_.BroadcastLdFrag addr:$src2)), + (i8 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -10241,15 +10238,14 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3))>, + (i32 timm:$src3))>, Sched<[sched]>; defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), + (ins _.RC:$src1, _.IntScalarMemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), - (_.VT (scalar_to_vector - (_.ScalarLdFrag addr:$src2))), - (i32 imm:$src3))>, + (_.VT _.ScalarIntMemCPat:$src2), + (i32 timm:$src3))>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10265,7 +10261,7 @@ multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, "$src1, $src2, {sae}, 
$src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3))>, + (i32 timm:$src3))>, EVEX_B, Sched<[sched]>; } @@ -10279,7 +10275,7 @@ multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode "$src1, $src2, {sae}, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3))>, + (i32 timm:$src3))>, EVEX_B, Sched<[sched]>; } @@ -10401,7 +10397,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr, OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (bitconvert (CastInfo.VT (X86Shuf128 _.RC:$src1, _.RC:$src2, - (i8 imm:$src3)))))>, + (i8 timm:$src3)))))>, Sched<[sched]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rr">; defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), @@ -10410,7 +10406,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr, (bitconvert (CastInfo.VT (X86Shuf128 _.RC:$src1, (CastInfo.LdFrag addr:$src2), - (i8 imm:$src3)))))>, + (i8 timm:$src3)))))>, Sched<[sched.Folded, sched.ReadAfterFold]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rm">; defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), @@ -10421,8 +10417,8 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr, (bitconvert (CastInfo.VT (X86Shuf128 _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src2)), - (i8 imm:$src3)))))>, EVEX_B, + (_.BroadcastLdFrag addr:$src2), + (i8 timm:$src3)))))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10491,14 +10487,14 @@ multiclass avx512_valign<bits<8> opc, string OpcodeStr, defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (_.VT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 imm:$src3)))>, + (_.VT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 timm:$src3)))>, Sched<[sched]>, EVEX2VEXOverride<"VPALIGNRrri">; defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (X86VAlign _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)), - (i8 imm:$src3)))>, + (i8 timm:$src3)))>, Sched<[sched.Folded, sched.ReadAfterFold]>, EVEX2VEXOverride<"VPALIGNRrmi">; @@ -10507,8 +10503,8 @@ multiclass avx512_valign<bits<8> opc, string OpcodeStr, OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr##", $src3", (X86VAlign _.RC:$src1, - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - (i8 imm:$src3))>, EVEX_B, + (_.VT (_.BroadcastLdFrag addr:$src2)), + (i8 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10541,13 +10537,13 @@ defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr", // Fragments to help convert valignq into masked valignd. Or valignq/valignd // into vpalignr. 
-def ValignqImm32XForm : SDNodeXForm<imm, [{ +def ValignqImm32XForm : SDNodeXForm<timm, [{ return getI8Imm(N->getZExtValue() * 2, SDLoc(N)); }]>; -def ValignqImm8XForm : SDNodeXForm<imm, [{ +def ValignqImm8XForm : SDNodeXForm<timm, [{ return getI8Imm(N->getZExtValue() * 8, SDLoc(N)); }]>; -def ValigndImm8XForm : SDNodeXForm<imm, [{ +def ValigndImm8XForm : SDNodeXForm<timm, [{ return getI8Imm(N->getZExtValue() * 4, SDLoc(N)); }]>; @@ -10557,40 +10553,40 @@ multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode, def : Pat<(To.VT (vselect To.KRCWM:$mask, (bitconvert (From.VT (OpNode From.RC:$src1, From.RC:$src2, - imm:$src3))), + timm:$src3))), To.RC:$src0)), (!cast<Instruction>(OpcodeStr#"rrik") To.RC:$src0, To.KRCWM:$mask, To.RC:$src1, To.RC:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; def : Pat<(To.VT (vselect To.KRCWM:$mask, (bitconvert (From.VT (OpNode From.RC:$src1, From.RC:$src2, - imm:$src3))), + timm:$src3))), To.ImmAllZerosV)), (!cast<Instruction>(OpcodeStr#"rrikz") To.KRCWM:$mask, To.RC:$src1, To.RC:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; def : Pat<(To.VT (vselect To.KRCWM:$mask, (bitconvert (From.VT (OpNode From.RC:$src1, (From.LdFrag addr:$src2), - imm:$src3))), + timm:$src3))), To.RC:$src0)), (!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask, To.RC:$src1, addr:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; def : Pat<(To.VT (vselect To.KRCWM:$mask, (bitconvert (From.VT (OpNode From.RC:$src1, (From.LdFrag addr:$src2), - imm:$src3))), + timm:$src3))), To.ImmAllZerosV)), (!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask, To.RC:$src1, addr:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; } multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode, @@ -10599,35 +10595,32 @@ multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode, SDNodeXForm ImmXForm> : avx512_vpalign_mask_lowering<OpcodeStr, OpNode, From, To, ImmXForm> { def : Pat<(From.VT (OpNode From.RC:$src1, - (bitconvert (To.VT (X86VBroadcast - (To.ScalarLdFrag addr:$src2)))), - imm:$src3)), + (bitconvert (To.VT (To.BroadcastLdFrag addr:$src2))), + timm:$src3)), (!cast<Instruction>(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; def : Pat<(To.VT (vselect To.KRCWM:$mask, (bitconvert (From.VT (OpNode From.RC:$src1, (bitconvert - (To.VT (X86VBroadcast - (To.ScalarLdFrag addr:$src2)))), - imm:$src3))), + (To.VT (To.BroadcastLdFrag addr:$src2))), + timm:$src3))), To.RC:$src0)), (!cast<Instruction>(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask, To.RC:$src1, addr:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; def : Pat<(To.VT (vselect To.KRCWM:$mask, (bitconvert (From.VT (OpNode From.RC:$src1, (bitconvert - (To.VT (X86VBroadcast - (To.ScalarLdFrag addr:$src2)))), - imm:$src3))), + (To.VT (To.BroadcastLdFrag addr:$src2))), + timm:$src3))), To.ImmAllZerosV)), (!cast<Instruction>(OpcodeStr#"rmbikz") To.KRCWM:$mask, To.RC:$src1, addr:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; } let Predicates = [HasAVX512] in { @@ -10666,13 +10659,13 @@ multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", - (_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase, + (_.VT (OpNode (_.VT _.RC:$src1)))>, EVEX, AVX5128IBase, Sched<[sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src1), OpcodeStr, 
"$src1", "$src1", - (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>, + (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1)))))>, EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded]>; } @@ -10685,8 +10678,7 @@ multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.ScalarMemOp:$src1), OpcodeStr, "${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr, - (_.VT (OpNode (X86VBroadcast - (_.ScalarLdFrag addr:$src1))))>, + (_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src1))))>, EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded]>; } @@ -10770,7 +10762,7 @@ let Predicates = [HasAVX512, NoVLX] in { multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode, AVX512VLVectorVTInfo _, Predicate prd> { let Predicates = [prd, NoVLX] in { - def : Pat<(_.info256.VT(OpNode _.info256.RC:$src1)), + def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1))), (EXTRACT_SUBREG (!cast<Instruction>(InstrStr # "Zrr") (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), @@ -10778,7 +10770,7 @@ multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode, _.info256.SubRegIdx)), _.info256.SubRegIdx)>; - def : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)), + def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1))), (EXTRACT_SUBREG (!cast<Instruction>(InstrStr # "Zrr") (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), @@ -10829,17 +10821,16 @@ defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup, // AVX-512 - MOVDDUP //===----------------------------------------------------------------------===// -multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "$src", "$src", - (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX, + (_.VT (X86VBroadcast (_.VT _.RC:$src)))>, EVEX, Sched<[sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src", - (_.VT (OpNode (_.VT (scalar_to_vector - (_.ScalarLdFrag addr:$src)))))>, + (_.VT (_.BroadcastLdFrag addr:$src))>, EVEX, EVEX_CD8<_.EltSize, CD8VH>, Sched<[sched.Folded]>; } @@ -10853,7 +10844,7 @@ multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode, let Predicates = [HasAVX512, HasVLX] in { defm Z256 : avx512_unary_rm<opc, OpcodeStr, X86Movddup, sched.YMM, VTInfo.info256>, EVEX_V256; - defm Z128 : avx512_movddup_128<opc, OpcodeStr, X86VBroadcast, sched.XMM, + defm Z128 : avx512_movddup_128<opc, OpcodeStr, sched.XMM, VTInfo.info128>, EVEX_V128; } } @@ -10867,11 +10858,9 @@ multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode, defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup, SchedWriteFShuffle>; let Predicates = [HasVLX] in { -def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), - (VMOVDDUPZ128rm addr:$src)>; def : Pat<(v2f64 (X86VBroadcast f64:$src)), (VMOVDDUPZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; -def : Pat<(v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))), +def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))), (VMOVDDUPZ128rm addr:$src)>; def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))), (VMOVDDUPZ128rm addr:$src)>; @@ -10884,17 +10873,17 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), immAllZerosV), (VMOVDDUPZ128rrkz VK2WM:$mask, (v2f64 
(COPY_TO_REGCLASS FR64X:$src, VR128X)))>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)), (v2f64 VR128X:$src0)), (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)), immAllZerosV), (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))), +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))), (v2f64 VR128X:$src0)), (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))), +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))), immAllZerosV), (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; } @@ -11070,14 +11059,14 @@ multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr, def rr : AVX512<opc, MRMr, (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>, + [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 timm:$src2))))]>, Sched<[sched]>; def rm : AVX512<opc, MRMm, (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.RC:$dst,(_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), - (i8 imm:$src2))))]>, + (i8 timm:$src2))))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -11104,6 +11093,7 @@ defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq", multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _dst, X86VectorVTInfo _src> { + let isCommutable = 1 in def rr : AVX512BI<opc, MRMSrcReg, (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -11140,7 +11130,7 @@ defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw", // Transforms to swizzle an immediate to enable better matching when // memory operand isn't in the right place. -def VPTERNLOG321_imm8 : SDNodeXForm<imm, [{ +def VPTERNLOG321_imm8 : SDNodeXForm<timm, [{ // Convert a VPTERNLOG immediate by swapping operand 0 and operand 2. uint8_t Imm = N->getZExtValue(); // Swap bits 1/4 and 3/6. @@ -11151,7 +11141,7 @@ def VPTERNLOG321_imm8 : SDNodeXForm<imm, [{ if (Imm & 0x40) NewImm |= 0x08; return getI8Imm(NewImm, SDLoc(N)); }]>; -def VPTERNLOG213_imm8 : SDNodeXForm<imm, [{ +def VPTERNLOG213_imm8 : SDNodeXForm<timm, [{ // Convert a VPTERNLOG immediate by swapping operand 1 and operand 2. uint8_t Imm = N->getZExtValue(); // Swap bits 2/4 and 3/5. @@ -11162,7 +11152,7 @@ def VPTERNLOG213_imm8 : SDNodeXForm<imm, [{ if (Imm & 0x20) NewImm |= 0x08; return getI8Imm(NewImm, SDLoc(N)); }]>; -def VPTERNLOG132_imm8 : SDNodeXForm<imm, [{ +def VPTERNLOG132_imm8 : SDNodeXForm<timm, [{ // Convert a VPTERNLOG immediate by swapping operand 1 and operand 2. uint8_t Imm = N->getZExtValue(); // Swap bits 1/2 and 5/6. 
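The VPTERNLOG*_imm8 XForms in this region all permute the same structure: a VPTERNLOG imm8 is a 3-input truth table whose result bit for inputs (a, b, c) sits at index (a<<2)|(b<<1)|c, with the first operand selecting the high index bit, so reordering the instruction's operands permutes the index bits of the immediate. A generic C++ sketch of that transform (swizzleTernlogImm is an illustrative helper, not the commit's code):

#include <cstdint>

// perm[i] names which original operand feeds operand position i of the
// reordered instruction; the imm8 is rebuilt so the truth table is unchanged.
uint8_t swizzleTernlogImm(uint8_t imm, const int perm[3]) {
  uint8_t out = 0;
  for (int idx = 0; idx < 8; ++idx) {
    const int b[3] = {(idx >> 2) & 1, (idx >> 1) & 1, idx & 1}; // (a, b, c)
    const int newIdx = (b[perm[0]] << 2) | (b[perm[1]] << 1) | b[perm[2]];
    if (imm & (1 << idx))
      out |= uint8_t(1u << newIdx);
  }
  return out;
}

With perm = {2, 1, 0} (operands reversed), imm bits 1 and 4 swap and bits 3 and 6 swap, matching the hard-coded VPTERNLOG321_imm8 above; perm = {1, 0, 2} reproduces the 2/4 and 3/5 swaps of VPTERNLOG213_imm8.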
@@ -11173,7 +11163,7 @@ def VPTERNLOG132_imm8 : SDNodeXForm<imm, [{ if (Imm & 0x40) NewImm |= 0x20; return getI8Imm(NewImm, SDLoc(N)); }]>; -def VPTERNLOG231_imm8 : SDNodeXForm<imm, [{ +def VPTERNLOG231_imm8 : SDNodeXForm<timm, [{ // Convert a VPTERNLOG immediate by moving operand 1 to the end. uint8_t Imm = N->getZExtValue(); // Move bits 1->2, 2->4, 3->6, 4->1, 5->3, 6->5 @@ -11186,7 +11176,7 @@ def VPTERNLOG231_imm8 : SDNodeXForm<imm, [{ if (Imm & 0x40) NewImm |= 0x20; return getI8Imm(NewImm, SDLoc(N)); }]>; -def VPTERNLOG312_imm8 : SDNodeXForm<imm, [{ +def VPTERNLOG312_imm8 : SDNodeXForm<timm, [{ // Convert a VPTERNLOG immediate by moving operand 2 to the beginning. uint8_t Imm = N->getZExtValue(); // Move bits 1->4, 2->1, 3->5, 4->2, 5->6, 6->3 @@ -11210,7 +11200,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_.VT _.RC:$src3), - (i8 imm:$src4)), 1, 1>, + (i8 timm:$src4)), 1, 1>, AVX512AIi8Base, EVEX_4V, Sched<[sched]>; defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4), @@ -11218,7 +11208,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_.VT (bitconvert (_.LdFrag addr:$src3))), - (i8 imm:$src4)), 1, 0>, + (i8 timm:$src4)), 1, 0>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), @@ -11227,146 +11217,145 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src2, ${src3}"##_.BroadcastStr##", $src4", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), - (i8 imm:$src4)), 1, 0>, EVEX_B, + (_.VT (_.BroadcastLdFrag addr:$src3)), + (i8 timm:$src4)), 1, 0>, EVEX_B, AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; }// Constraints = "$src1 = $dst" // Additional patterns for matching passthru operand in other positions. def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + (OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + _.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 imm:$src4)), + (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 imm:$src4))>; + _.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 timm:$src4))>; // Additional patterns for matching loads in other positions. 
def : Pat<(_.VT (OpNode (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1, (i8 imm:$src4))), + _.RC:$src2, _.RC:$src1, (i8 timm:$src4))), (!cast<Instruction>(Name#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2, - addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src2, (i8 imm:$src4))), + _.RC:$src2, (i8 timm:$src4))), (!cast<Instruction>(Name#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2, - addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; // Additional patterns for matching zero masking with loads in other // positions. def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.ImmAllZerosV)), (!cast<Instruction>(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src2, (i8 imm:$src4)), + _.RC:$src2, (i8 timm:$src4)), _.ImmAllZerosV)), (!cast<Instruction>(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; // Additional patterns for matching masked loads with different // operand orders. def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src2, (i8 imm:$src4)), + _.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src2, _.RC:$src1, - (bitconvert (_.LdFrag addr:$src3)), (i8 imm:$src4)), + (bitconvert (_.LdFrag addr:$src3)), (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src2, (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src1, (i8 imm:$src4)), + _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src1, _.RC:$src2, (i8 imm:$src4)), + _.RC:$src1, _.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 timm:$src4))>; // Additional patterns for matching broadcasts in other positions. 
- def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1, (i8 imm:$src4))), + def : Pat<(_.VT (OpNode (_.BroadcastLdFrag addr:$src3), + _.RC:$src2, _.RC:$src1, (i8 timm:$src4))), (!cast<Instruction>(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2, - addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (OpNode _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, (i8 imm:$src4))), + (_.BroadcastLdFrag addr:$src3), + _.RC:$src2, (i8 timm:$src4))), (!cast<Instruction>(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2, - addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; // Additional patterns for matching zero masking with broadcasts in other // positions. def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + (OpNode (_.BroadcastLdFrag addr:$src3), + _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.ImmAllZerosV)), (!cast<Instruction>(Name#_.ZSuffix#rmbikz) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, - (VPTERNLOG321_imm8 imm:$src4))>; + (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, (i8 imm:$src4)), + (_.BroadcastLdFrag addr:$src3), + _.RC:$src2, (i8 timm:$src4)), _.ImmAllZerosV)), (!cast<Instruction>(Name#_.ZSuffix#rmbikz) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, - (VPTERNLOG132_imm8 imm:$src4))>; + (VPTERNLOG132_imm8 timm:$src4))>; // Additional patterns for matching masked broadcasts with different // operand orders. def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, (i8 imm:$src4)), + (OpNode _.RC:$src1, (_.BroadcastLdFrag addr:$src3), + _.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + (OpNode (_.BroadcastLdFrag addr:$src3), + _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src2, _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - (i8 imm:$src4)), _.RC:$src1)), + (_.BroadcastLdFrag addr:$src3), + (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src2, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src1, (i8 imm:$src4)), + (_.BroadcastLdFrag addr:$src3), + _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src1, _.RC:$src2, (i8 imm:$src4)), + (OpNode (_.BroadcastLdFrag addr:$src3), + _.RC:$src1, 
_.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 timm:$src4))>; } multiclass avx512_common_ternlog<string OpcodeStr, X86SchedWriteWidths sched, @@ -11387,6 +11376,113 @@ defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", SchedWriteVecALU, defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU, avx512vl_i64_info>, VEX_W; +// Patterns to use VPTERNLOG for vXi16/vXi8 vectors. +let Predicates = [HasVLX] in { + def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3, + (i8 timm:$src4))), + (VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3, + timm:$src4)>; + def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2, + (loadv16i8 addr:$src3), (i8 timm:$src4))), + (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v16i8 (X86vpternlog (loadv16i8 addr:$src3), VR128X:$src2, + VR128X:$src1, (i8 timm:$src4))), + (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v16i8 (X86vpternlog VR128X:$src1, (loadv16i8 addr:$src3), + VR128X:$src2, (i8 timm:$src4))), + (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3, + (i8 timm:$src4))), + (VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3, + timm:$src4)>; + def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2, + (loadv8i16 addr:$src3), (i8 timm:$src4))), + (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v8i16 (X86vpternlog (loadv8i16 addr:$src3), VR128X:$src2, + VR128X:$src1, (i8 timm:$src4))), + (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v8i16 (X86vpternlog VR128X:$src1, (loadv8i16 addr:$src3), + VR128X:$src2, (i8 timm:$src4))), + (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3, + (i8 timm:$src4))), + (VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3, + timm:$src4)>; + def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2, + (loadv32i8 addr:$src3), (i8 timm:$src4))), + (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v32i8 (X86vpternlog (loadv32i8 addr:$src3), VR256X:$src2, + VR256X:$src1, (i8 timm:$src4))), + (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v32i8 (X86vpternlog VR256X:$src1, (loadv32i8 addr:$src3), + VR256X:$src2, (i8 timm:$src4))), + (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3, + (i8 timm:$src4))), + (VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3, + timm:$src4)>; + def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2, + (loadv16i16 addr:$src3), (i8 timm:$src4))), + (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v16i16 (X86vpternlog (loadv16i16 addr:$src3), VR256X:$src2, + VR256X:$src1, (i8 timm:$src4))), + (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v16i16 (X86vpternlog VR256X:$src1, (loadv16i16 addr:$src3), + VR256X:$src2, (i8 
timm:$src4))), + (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; +} + +let Predicates = [HasAVX512] in { + def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2, VR512:$src3, + (i8 timm:$src4))), + (VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3, + timm:$src4)>; + def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2, + (loadv64i8 addr:$src3), (i8 timm:$src4))), + (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v64i8 (X86vpternlog (loadv64i8 addr:$src3), VR512:$src2, + VR512:$src1, (i8 timm:$src4))), + (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v64i8 (X86vpternlog VR512:$src1, (loadv64i8 addr:$src3), + VR512:$src2, (i8 timm:$src4))), + (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2, VR512:$src3, + (i8 timm:$src4))), + (VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3, + timm:$src4)>; + def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2, + (loadv32i16 addr:$src3), (i8 timm:$src4))), + (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v32i16 (X86vpternlog (loadv32i16 addr:$src3), VR512:$src2, + VR512:$src1, (i8 timm:$src4))), + (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v32i16 (X86vpternlog VR512:$src1, (loadv32i16 addr:$src3), + VR512:$src2, (i8 timm:$src4))), + (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; +} + // Patterns to implement vnot using vpternlog instead of creating all ones // using pcmpeq or vpternlog and then xoring with that. The value 15 is chosen // so that the result is only dependent on src0. 
But we use the same source @@ -11498,14 +11594,14 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, (X86VFixupimm (_.VT _.RC:$src1), (_.VT _.RC:$src2), (TblVT.VT _.RC:$src3), - (i32 imm:$src4))>, Sched<[sched]>; + (i32 timm:$src4))>, Sched<[sched]>; defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", (X86VFixupimm (_.VT _.RC:$src1), (_.VT _.RC:$src2), (TblVT.VT (bitconvert (TblVT.LdFrag addr:$src3))), - (i32 imm:$src4))>, + (i32 timm:$src4))>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4), @@ -11513,8 +11609,8 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, "$src2, ${src3}"##_.BroadcastStr##", $src4", (X86VFixupimm (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (TblVT.VT (X86VBroadcast(TblVT.ScalarLdFrag addr:$src3))), - (i32 imm:$src4))>, + (TblVT.VT (TblVT.BroadcastLdFrag addr:$src3)), + (i32 timm:$src4))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } // Constraints = "$src1 = $dst" } @@ -11531,7 +11627,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { (X86VFixupimmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), (TblVT.VT _.RC:$src3), - (i32 imm:$src4))>, + (i32 timm:$src4))>, EVEX_B, Sched<[sched]>; } } @@ -11547,7 +11643,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, (X86VFixupimms (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_src3VT.VT _src3VT.RC:$src3), - (i32 imm:$src4))>, Sched<[sched]>; + (i32 timm:$src4))>, Sched<[sched]>; defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2", @@ -11555,7 +11651,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, (X86VFixupimmSAEs (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_src3VT.VT _src3VT.RC:$src3), - (i32 imm:$src4))>, + (i32 timm:$src4))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4), @@ -11564,13 +11660,13 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, (_.VT _.RC:$src2), (_src3VT.VT (scalar_to_vector (_src3VT.ScalarLdFrag addr:$src3))), - (i32 imm:$src4))>, + (i32 timm:$src4))>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } multiclass avx512_fixupimm_packed_all<X86SchedWriteWidths sched, - AVX512VLVectorVTInfo _Vec, + AVX512VLVectorVTInfo _Vec, AVX512VLVectorVTInfo _Tbl> { let Predicates = [HasAVX512] in defm Z : avx512_fixupimm_packed_sae<0x54, "vfixupimm", sched.ZMM, @@ -11804,7 +11900,7 @@ multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode, "${src3}"##VTI.BroadcastStr##", $src2", "$src2, ${src3}"##VTI.BroadcastStr, (OpNode VTI.RC:$src1, VTI.RC:$src2, - (VTI.VT (X86VBroadcast (VTI.ScalarLdFrag addr:$src3))))>, + (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -11880,12 +11976,14 @@ defm VPEXPANDW : expand_by_elt_width <0x62, "vpexpandw", WriteVarShuffle256, let Constraints = "$src1 = $dst" in multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode, - X86FoldableSchedWrite sched, X86VectorVTInfo VTI> { + X86FoldableSchedWrite sched, X86VectorVTInfo VTI, + bit IsCommutable> { defm r : AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs 
VTI.RC:$dst), (ins VTI.RC:$src2, VTI.RC:$src3), OpStr, "$src3, $src2", "$src2, $src3", (VTI.VT (OpNode VTI.RC:$src1, - VTI.RC:$src2, VTI.RC:$src3))>, + VTI.RC:$src2, VTI.RC:$src3)), + IsCommutable, IsCommutable>, EVEX_4V, T8PD, Sched<[sched]>; defm m : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr, @@ -11899,27 +11997,58 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode, OpStr, "${src3}"##VTI.BroadcastStr##", $src2", "$src2, ${src3}"##VTI.BroadcastStr, (OpNode VTI.RC:$src1, VTI.RC:$src2, - (VTI.VT (X86VBroadcast - (VTI.ScalarLdFrag addr:$src3))))>, + (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B, T8PD, Sched<[sched.Folded, sched.ReadAfterFold]>; } multiclass VNNI_common<bits<8> Op, string OpStr, SDNode OpNode, - X86SchedWriteWidths sched> { + X86SchedWriteWidths sched, bit IsCommutable> { let Predicates = [HasVNNI] in - defm Z : VNNI_rmb<Op, OpStr, OpNode, sched.ZMM, v16i32_info>, EVEX_V512; + defm Z : VNNI_rmb<Op, OpStr, OpNode, sched.ZMM, v16i32_info, + IsCommutable>, EVEX_V512; let Predicates = [HasVNNI, HasVLX] in { - defm Z256 : VNNI_rmb<Op, OpStr, OpNode, sched.YMM, v8i32x_info>, EVEX_V256; - defm Z128 : VNNI_rmb<Op, OpStr, OpNode, sched.XMM, v4i32x_info>, EVEX_V128; + defm Z256 : VNNI_rmb<Op, OpStr, OpNode, sched.YMM, v8i32x_info, + IsCommutable>, EVEX_V256; + defm Z128 : VNNI_rmb<Op, OpStr, OpNode, sched.XMM, v4i32x_info, + IsCommutable>, EVEX_V128; } } // FIXME: Is there a better scheduler class for VPDP? -defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul>; -defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul>; -defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul>; -defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul>; +defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul, 0>; +defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul, 0>; +defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul, 1>; +defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul, 1>; + +def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs), + (X86vpmaddwd node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; + +// Patterns to match VPDPWSSD from existing instructions/intrinsics. 
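(Illustrative aside, not part of the patch: the _su fragment above restricts the match to vpmaddwd results with a single use, so the standalone multiply-add is not still needed after fusion. Per i32 lane, the fused operation computes the following; the function name is mine.)

#include <cstdint>
// One i32 lane of add(acc, pmaddwd(a, b)), the combination these patterns
// fuse into a single VPDPWSSD: multiply adjacent signed i16 pairs, sum the
// two products, then accumulate into the i32 lane.
int32_t vpdpwssdLane(int32_t Acc, int16_t A0, int16_t A1,
                     int16_t B0, int16_t B1) {
  int32_t Mad = (int32_t)A0 * B0 + (int32_t)A1 * B1; // the pmaddwd half
  return Acc + Mad;                                  // the add being folded
}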
+let Predicates = [HasVNNI] in { + def : Pat<(v16i32 (add VR512:$src1, + (X86vpmaddwd_su VR512:$src2, VR512:$src3))), + (VPDPWSSDZr VR512:$src1, VR512:$src2, VR512:$src3)>; + def : Pat<(v16i32 (add VR512:$src1, + (X86vpmaddwd_su VR512:$src2, (load addr:$src3)))), + (VPDPWSSDZm VR512:$src1, VR512:$src2, addr:$src3)>; +} +let Predicates = [HasVNNI,HasVLX] in { + def : Pat<(v8i32 (add VR256X:$src1, + (X86vpmaddwd_su VR256X:$src2, VR256X:$src3))), + (VPDPWSSDZ256r VR256X:$src1, VR256X:$src2, VR256X:$src3)>; + def : Pat<(v8i32 (add VR256X:$src1, + (X86vpmaddwd_su VR256X:$src2, (load addr:$src3)))), + (VPDPWSSDZ256m VR256X:$src1, VR256X:$src2, addr:$src3)>; + def : Pat<(v4i32 (add VR128X:$src1, + (X86vpmaddwd_su VR128X:$src2, VR128X:$src3))), + (VPDPWSSDZ128r VR128X:$src1, VR128X:$src2, VR128X:$src3)>; + def : Pat<(v4i32 (add VR128X:$src1, + (X86vpmaddwd_su VR128X:$src2, (load addr:$src3)))), + (VPDPWSSDZ128m VR128X:$src1, VR128X:$src2, addr:$src3)>; +} //===----------------------------------------------------------------------===// // Bit Algorithms @@ -12004,8 +12133,8 @@ multiclass GF2P8AFFINE_avx512_rmb_imm<bits<8> Op, string OpStr, SDNode OpNode, OpStr, "$src3, ${src2}"##BcstVTI.BroadcastStr##", $src1", "$src1, ${src2}"##BcstVTI.BroadcastStr##", $src3", (OpNode (VTI.VT VTI.RC:$src1), - (bitconvert (BcstVTI.VT (X86VBroadcast (loadi64 addr:$src2)))), - (i8 imm:$src3))>, EVEX_B, + (bitconvert (BcstVTI.VT (X86VBroadcastld64 addr:$src2))), + (i8 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -12116,7 +12245,7 @@ multiclass avx512_vp2intersect_modes<X86VectorVTInfo _> { !strconcat("vp2intersect", _.Suffix, "\t{${src2}", _.BroadcastStr, ", $src1, $dst|$dst, $src1, ${src2}", _.BroadcastStr ,"}"), [(set _.KRPC:$dst, (X86vp2intersect - _.RC:$src1, (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src2)))))]>, + _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))]>, EVEX_4V, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; } @@ -12217,12 +12346,12 @@ let Predicates = [HasBF16, HasVLX] in { (VCVTNEPS2BF16Z128rmkz VK4WM:$mask, addr:$src)>; def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32 - (X86VBroadcast (loadf32 addr:$src))))), + (X86VBroadcastld32 addr:$src)))), (VCVTNEPS2BF16Z128rmb addr:$src)>; - def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))), + def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)), (v8i16 VR128X:$src0), VK4WM:$mask), (VCVTNEPS2BF16Z128rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; - def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))), + def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)), v8i16x_info.ImmAllZerosV, VK4WM:$mask), (VCVTNEPS2BF16Z128rmbkz VK4WM:$mask, addr:$src)>; } @@ -12249,7 +12378,7 @@ multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr), (_.VT (OpNode _.RC:$src1, _.RC:$src2, - (src_v.VT (X86VBroadcast(src_v.ScalarLdFrag addr:$src3)))))>, + (src_v.VT (src_v.BroadcastLdFrag addr:$src3))))>, EVEX_B, EVEX_4V; } diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index e52635f8d48b..1e399a894490 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -1271,22 +1271,22 @@ let isCompare = 1 in { // ANDN Instruction // multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop, - PatFrag ld_frag> { + PatFrag ld_frag, X86FoldableSchedWrite sched> { def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), 
(ins RC:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>, - Sched<[WriteALU]>; + Sched<[sched]>; def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>, - Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } // Complexity is reduced to give and with immediate a chance to match first. let Predicates = [HasBMI], Defs = [EFLAGS], AddedComplexity = -6 in { - defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8PS, VEX_4V; - defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8PS, VEX_4V, VEX_W; + defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32, WriteALU>, T8PS, VEX_4V; + defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64, WriteALU>, T8PS, VEX_4V, VEX_W; } let Predicates = [HasBMI], AddedComplexity = -6 in { diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h index 50aed98112c3..aa45e9b191c1 100644 --- a/lib/Target/X86/X86InstrBuilder.h +++ b/lib/Target/X86/X86InstrBuilder.h @@ -131,11 +131,11 @@ addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) { /// reference. static inline void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, unsigned Reg) { - // Direct memory address is in a form of: Reg, 1 (Scale), NoReg, 0, NoReg. - MI->getOperand(Operand).setReg(Reg); + // Direct memory address is in a form of: Reg/FI, 1 (Scale), NoReg, 0, NoReg. + MI->getOperand(Operand).ChangeToRegister(Reg, /*isDef=*/false); MI->getOperand(Operand + 1).setImm(1); MI->getOperand(Operand + 2).setReg(0); - MI->getOperand(Operand + 3).setImm(0); + MI->getOperand(Operand + 3).ChangeToImmediate(0); MI->getOperand(Operand + 4).setReg(0); } diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td index 099f6aa8d8bb..330b8c7a8a43 100644 --- a/lib/Target/X86/X86InstrCMovSetCC.td +++ b/lib/Target/X86/X86InstrCMovSetCC.td @@ -20,19 +20,19 @@ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", : I<0x40, MRMSrcRegCC, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, ccode:$cond), "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, - (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>, + (X86cmov GR16:$src1, GR16:$src2, timm:$cond, EFLAGS))]>, TB, OpSize16; def CMOV32rr : I<0x40, MRMSrcRegCC, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, ccode:$cond), "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, - (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>, + (X86cmov GR32:$src1, GR32:$src2, timm:$cond, EFLAGS))]>, TB, OpSize32; def CMOV64rr :RI<0x40, MRMSrcRegCC, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, ccode:$cond), "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, - (X86cmov GR64:$src1, GR64:$src2, imm:$cond, EFLAGS))]>, TB; + (X86cmov GR64:$src1, GR64:$src2, timm:$cond, EFLAGS))]>, TB; } let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", @@ -41,29 +41,46 @@ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", : I<0x40, MRMSrcMemCC, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2, ccode:$cond), "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2), - imm:$cond, EFLAGS))]>, TB, OpSize16; + timm:$cond, EFLAGS))]>, TB, OpSize16; def CMOV32rm : I<0x40, MRMSrcMemCC, 
(outs GR32:$dst), (ins GR32:$src1, i32mem:$src2, ccode:$cond), "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2), - imm:$cond, EFLAGS))]>, TB, OpSize32; + timm:$cond, EFLAGS))]>, TB, OpSize32; def CMOV64rm :RI<0x40, MRMSrcMemCC, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2, ccode:$cond), "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), - imm:$cond, EFLAGS))]>, TB; + timm:$cond, EFLAGS))]>, TB; } // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" } // isCodeGenOnly = 1, ForceDisassemble = 1 +def inv_cond_XFORM : SDNodeXForm<imm, [{ + X86::CondCode CC = static_cast<X86::CondCode>(N->getZExtValue()); + return CurDAG->getTargetConstant(X86::GetOppositeBranchCondition(CC), + SDLoc(N), MVT::i8); +}]>; + +// Conditional moves with folded loads with operands swapped and conditions +// inverted. +let Predicates = [HasCMov] in { + def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, timm:$cond, EFLAGS), + (CMOV16rm GR16:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>; + def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, timm:$cond, EFLAGS), + (CMOV32rm GR32:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>; + def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, timm:$cond, EFLAGS), + (CMOV64rm GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>; +} + // SetCC instructions. let Uses = [EFLAGS], isCodeGenOnly = 1, ForceDisassemble = 1 in { def SETCCr : I<0x90, MRMXrCC, (outs GR8:$dst), (ins ccode:$cond), "set${cond}\t$dst", - [(set GR8:$dst, (X86setcc imm:$cond, EFLAGS))]>, + [(set GR8:$dst, (X86setcc timm:$cond, EFLAGS))]>, TB, Sched<[WriteSETCC]>; def SETCCm : I<0x90, MRMXmCC, (outs), (ins i8mem:$dst, ccode:$cond), "set${cond}\t$dst", - [(store (X86setcc imm:$cond, EFLAGS), addr:$dst)]>, + [(store (X86setcc timm:$cond, EFLAGS), addr:$dst)]>, TB, Sched<[WriteSETCCStore]>; } // Uses = [EFLAGS] diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index efaccdc9ee96..78d8dd3c0d03 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -542,7 +542,7 @@ multiclass CMOVrr_PSEUDO<RegisterClass RC, ValueType VT> { def CMOV#NAME : I<0, Pseudo, (outs RC:$dst), (ins RC:$t, RC:$f, i8imm:$cond), "#CMOV_"#NAME#" PSEUDO!", - [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, imm:$cond, + [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, timm:$cond, EFLAGS)))]>; } @@ -593,66 +593,66 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in { defm _VK64 : CMOVrr_PSEUDO<VK64, v64i1>; } // usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] -def : Pat<(f128 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)), - (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>; +def : Pat<(f128 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), + (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; let Predicates = [NoVLX] in { - def : Pat<(v16i8 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)), - (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>; - def : Pat<(v8i16 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)), - (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>; - def : Pat<(v4i32 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)), - (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>; - def : Pat<(v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)), - (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>; - def : Pat<(v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)), - (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>; + def : Pat<(v16i8 (X86cmov VR128:$t, 
VR128:$f, timm:$cond, EFLAGS)), + (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; + def : Pat<(v8i16 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), + (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; + def : Pat<(v4i32 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), + (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; + def : Pat<(v4f32 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), + (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; + def : Pat<(v2f64 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), + (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; - def : Pat<(v32i8 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)), - (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>; - def : Pat<(v16i16 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)), - (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>; - def : Pat<(v8i32 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)), - (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>; - def : Pat<(v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)), - (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>; - def : Pat<(v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)), - (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>; + def : Pat<(v32i8 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)), + (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>; + def : Pat<(v16i16 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)), + (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>; + def : Pat<(v8i32 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)), + (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>; + def : Pat<(v8f32 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)), + (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>; + def : Pat<(v4f64 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)), + (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>; } let Predicates = [HasVLX] in { - def : Pat<(v16i8 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)), - (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>; - def : Pat<(v8i16 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)), - (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>; - def : Pat<(v4i32 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)), - (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>; - def : Pat<(v4f32 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)), - (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>; - def : Pat<(v2f64 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)), - (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>; + def : Pat<(v16i8 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), + (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; + def : Pat<(v8i16 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), + (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; + def : Pat<(v4i32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), + (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; + def : Pat<(v4f32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), + (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; + def : Pat<(v2f64 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), + (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; - def : Pat<(v32i8 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)), - (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>; - def : Pat<(v16i16 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)), - (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>; - def : Pat<(v8i32 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)), - (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>; - def : Pat<(v8f32 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)), - (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>; - def : Pat<(v4f64 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)), - (CMOV_VR256X 
VR256X:$t, VR256X:$f, imm:$cond)>; + def : Pat<(v32i8 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), + (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; + def : Pat<(v16i16 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), + (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; + def : Pat<(v8i32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), + (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; + def : Pat<(v8f32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), + (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; + def : Pat<(v4f64 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), + (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; } -def : Pat<(v64i8 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)), - (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>; -def : Pat<(v32i16 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)), - (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>; -def : Pat<(v16i32 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)), - (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>; -def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)), - (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>; -def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)), - (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>; +def : Pat<(v64i8 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), + (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; +def : Pat<(v32i16 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), + (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; +def : Pat<(v16i32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), + (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; +def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), + (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; +def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), + (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; //===----------------------------------------------------------------------===// // Normal-Instructions-With-Lock-Prefix Pseudo Instructions @@ -1126,12 +1126,12 @@ def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))), // binary size compared to a regular MOV, but it introduces an unnecessary // load, so is not suitable for regular or optsize functions. 
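(Illustrative aside, not part of the patch: to make the size win concrete with a plain register-indirect address, movl $0, (%rax) encodes as C7 00 00 00 00 00 — six bytes, four of them immediate — while andl $0, (%rax) encodes as 83 20 00, three bytes, since the AND form takes a sign-extended 8-bit immediate; orl $-1, (%rax) is likewise three bytes, 83 08 FF. The cost is that a pure store becomes a read-modify-write, hence the OptForMinSize guard below.)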
let Predicates = [OptForMinSize] in { -def : Pat<(nonvolatile_store (i16 0), addr:$dst), (AND16mi8 addr:$dst, 0)>; -def : Pat<(nonvolatile_store (i32 0), addr:$dst), (AND32mi8 addr:$dst, 0)>; -def : Pat<(nonvolatile_store (i64 0), addr:$dst), (AND64mi8 addr:$dst, 0)>; -def : Pat<(nonvolatile_store (i16 -1), addr:$dst), (OR16mi8 addr:$dst, -1)>; -def : Pat<(nonvolatile_store (i32 -1), addr:$dst), (OR32mi8 addr:$dst, -1)>; -def : Pat<(nonvolatile_store (i64 -1), addr:$dst), (OR64mi8 addr:$dst, -1)>; +def : Pat<(simple_store (i16 0), addr:$dst), (AND16mi8 addr:$dst, 0)>; +def : Pat<(simple_store (i32 0), addr:$dst), (AND32mi8 addr:$dst, 0)>; +def : Pat<(simple_store (i64 0), addr:$dst), (AND64mi8 addr:$dst, 0)>; +def : Pat<(simple_store (i16 -1), addr:$dst), (OR16mi8 addr:$dst, -1)>; +def : Pat<(simple_store (i32 -1), addr:$dst), (OR32mi8 addr:$dst, -1)>; +def : Pat<(simple_store (i64 -1), addr:$dst), (OR64mi8 addr:$dst, -1)>; } // In kernel code model, we can get the address of a label @@ -1276,23 +1276,6 @@ def : Pat<(X86cmp GR32:$src1, 0), def : Pat<(X86cmp GR64:$src1, 0), (TEST64rr GR64:$src1, GR64:$src1)>; -def inv_cond_XFORM : SDNodeXForm<imm, [{ - X86::CondCode CC = static_cast<X86::CondCode>(N->getZExtValue()); - return CurDAG->getTargetConstant(X86::GetOppositeBranchCondition(CC), - SDLoc(N), MVT::i8); -}]>; - -// Conditional moves with folded loads with operands swapped and conditions -// inverted. -let Predicates = [HasCMov] in { - def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, imm:$cond, EFLAGS), - (CMOV16rm GR16:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>; - def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, imm:$cond, EFLAGS), - (CMOV32rm GR32:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>; - def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, imm:$cond, EFLAGS), - (CMOV64rm GR64:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>; -} - // zextload bool -> zextload byte // i1 stored in one byte in zero-extended form. // Upper bits cleanup should be executed before Store. diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index f82e80965b7c..e1e6eea59884 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -75,7 +75,7 @@ let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump], def JCC_1 : Ii8PCRel <0x70, AddCCFrm, (outs), (ins brtarget8:$dst, ccode:$cond), "j${cond}\t$dst", - [(X86brcond bb:$dst, imm:$cond, EFLAGS)]>; + [(X86brcond bb:$dst, timm:$cond, EFLAGS)]>; let hasSideEffects = 0 in { def JCC_2 : Ii16PCRel<0x80, AddCCFrm, (outs), (ins brtarget16:$dst, ccode:$cond), @@ -145,6 +145,17 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { [(brind (loadi64 addr:$dst))]>, Requires<[In64BitMode]>, Sched<[WriteJumpLd]>; + // Win64 wants indirect jumps leaving the function to have a REX_W prefix. + // These are switched from TAILJMPr/m64_REX in MCInstLower. + let isCodeGenOnly = 1, hasREX_WPrefix = 1 in { + def JMP64r_REX : I<0xFF, MRM4r, (outs), (ins GR64:$dst), + "rex64 jmp{q}\t{*}$dst", []>, Sched<[WriteJump]>; + let mayLoad = 1 in + def JMP64m_REX : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), + "rex64 jmp{q}\t{*}$dst", []>, Sched<[WriteJumpLd]>; + + } + // Non-tracking jumps for IBT, use with caution. let isCodeGenOnly = 1 in { def JMP16r_NT : I<0xFF, MRM4r, (outs), (ins GR16 : $dst), "jmp{w}\t{*}$dst", @@ -273,39 +284,35 @@ let isCall = 1 in // Tail call stuff. 
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, - isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in - let Uses = [ESP, SSP] in { - def TCRETURNdi : PseudoI<(outs), - (ins i32imm_pcrel:$dst, i32imm:$offset), []>, NotMemoryFoldable; - def TCRETURNri : PseudoI<(outs), - (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, NotMemoryFoldable; + isCodeGenOnly = 1, Uses = [ESP, SSP] in { + def TCRETURNdi : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$offset), + []>, Sched<[WriteJump]>, NotMemoryFoldable; + def TCRETURNri : PseudoI<(outs), (ins ptr_rc_tailcall:$dst, i32imm:$offset), + []>, Sched<[WriteJump]>, NotMemoryFoldable; let mayLoad = 1 in - def TCRETURNmi : PseudoI<(outs), - (ins i32mem_TC:$dst, i32imm:$offset), []>; + def TCRETURNmi : PseudoI<(outs), (ins i32mem_TC:$dst, i32imm:$offset), + []>, Sched<[WriteJumpLd]>; - // FIXME: The should be pseudo instructions that are lowered when going to - // mcinst. - def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs), - (ins i32imm_pcrel:$dst), "jmp\t$dst", []>; + def TAILJMPd : PseudoI<(outs), (ins i32imm_pcrel:$dst), + []>, Sched<[WriteJump]>; - def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), - "", []>; // FIXME: Remove encoding when JIT is dead. + def TAILJMPr : PseudoI<(outs), (ins ptr_rc_tailcall:$dst), + []>, Sched<[WriteJump]>; let mayLoad = 1 in - def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst), - "jmp{l}\t{*}$dst", []>; + def TAILJMPm : PseudoI<(outs), (ins i32mem_TC:$dst), + []>, Sched<[WriteJumpLd]>; } // Conditional tail calls are similar to the above, but they are branches // rather than barriers, and they use EFLAGS. let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1, - isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in + isCodeGenOnly = 1, SchedRW = [WriteJump] in let Uses = [ESP, EFLAGS, SSP] in { def TCRETURNdicc : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>; // This gets substituted to a conditional jump instruction in MC lowering. 
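(Illustrative aside, not part of the patch: a rough sketch of the substitution MC lowering performs — the pseudo is rewritten to a real, encodable jump, which is why these definitions no longer carry encodings; TAILJMPd_CC analogously becomes a real JCC carrying the recorded condition. The exact opcode mapping below is an assumption, not taken from this change.)

// Assumed shape of the pseudo-to-real rewrite at MC lowering time; the
// opcode choices are illustrative, and LLVM's X86 backend context is
// assumed for the X86:: names and for llvm_unreachable.
static unsigned lowerTailJumpOpcode(unsigned PseudoOpc) {
  switch (PseudoOpc) {
  case X86::TAILJMPd:
  case X86::TAILJMPd64: return X86::JMP_1;  // direct jump
  case X86::TAILJMPr64: return X86::JMP64r; // indirect through a register
  case X86::TAILJMPm64: return X86::JMP64m; // indirect through memory
  default: llvm_unreachable("not a tail-jump pseudo");
  }
}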
- def TAILJMPd_CC : Ii32PCRel<0x80, RawFrm, (outs), - (ins i32imm_pcrel:$dst, i32imm:$cond), "", []>; + def TAILJMPd_CC : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$cond), []>; } @@ -348,34 +355,36 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in { } let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, - isCodeGenOnly = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in { + isCodeGenOnly = 1, Uses = [RSP, SSP] in { def TCRETURNdi64 : PseudoI<(outs), - (ins i64i32imm_pcrel:$dst, i32imm:$offset), - []>; + (ins i64i32imm_pcrel:$dst, i32imm:$offset), + []>, Sched<[WriteJump]>; def TCRETURNri64 : PseudoI<(outs), - (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, NotMemoryFoldable; + (ins ptr_rc_tailcall:$dst, i32imm:$offset), + []>, Sched<[WriteJump]>, NotMemoryFoldable; let mayLoad = 1 in def TCRETURNmi64 : PseudoI<(outs), - (ins i64mem_TC:$dst, i32imm:$offset), []>, NotMemoryFoldable; + (ins i64mem_TC:$dst, i32imm:$offset), + []>, Sched<[WriteJumpLd]>, NotMemoryFoldable; - def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst), - "jmp\t$dst", []>; + def TAILJMPd64 : PseudoI<(outs), (ins i64i32imm_pcrel:$dst), + []>, Sched<[WriteJump]>; - def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), - "jmp{q}\t{*}$dst", []>; + def TAILJMPr64 : PseudoI<(outs), (ins ptr_rc_tailcall:$dst), + []>, Sched<[WriteJump]>; let mayLoad = 1 in - def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), - "jmp{q}\t{*}$dst", []>; + def TAILJMPm64 : PseudoI<(outs), (ins i64mem_TC:$dst), + []>, Sched<[WriteJumpLd]>; // Win64 wants indirect jumps leaving the function to have a REX_W prefix. let hasREX_WPrefix = 1 in { - def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), - "rex64 jmp{q}\t{*}$dst", []>; + def TAILJMPr64_REX : PseudoI<(outs), (ins ptr_rc_tailcall:$dst), + []>, Sched<[WriteJump]>; let mayLoad = 1 in - def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), - "rex64 jmp{q}\t{*}$dst", []>; + def TAILJMPm64_REX : PseudoI<(outs), (ins i64mem_TC:$dst), + []>, Sched<[WriteJumpLd]>; } } @@ -403,13 +412,13 @@ let isPseudo = 1, isCall = 1, isCodeGenOnly = 1, // Conditional tail calls are similar to the above, but they are branches // rather than barriers, and they use EFLAGS. let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1, - isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in + isCodeGenOnly = 1, SchedRW = [WriteJump] in let Uses = [RSP, EFLAGS, SSP] in { def TCRETURNdi64cc : PseudoI<(outs), (ins i64i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>; // This gets substituted to a conditional jump instruction in MC lowering. - def TAILJMPd64_CC : Ii32PCRel<0x80, RawFrm, (outs), - (ins i64i32imm_pcrel:$dst, i32imm:$cond), "", []>; + def TAILJMPd64_CC : PseudoI<(outs), + (ins i64i32imm_pcrel:$dst, i32imm:$cond), []>; } diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td index 06e605fe5db2..7a4eb138ec34 100644 --- a/lib/Target/X86/X86InstrExtension.td +++ b/lib/Target/X86/X86InstrExtension.td @@ -17,19 +17,18 @@ let hasSideEffects = 0 in { let Defs = [EAX], Uses = [AX] in // EAX = signext(AX) def CWDE : I<0x98, RawFrm, (outs), (ins), "{cwtl|cwde}", []>, OpSize32, Sched<[WriteALU]>; + let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX) + def CDQE : RI<0x98, RawFrm, (outs), (ins), + "{cltq|cdqe}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>; + // FIXME: CWD/CDQ/CQO shouldn't Def the A register, but the fast register + // allocator crashes if you remove it. 
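(Illustrative aside, not part of the patch: scalar models of the sign-extension instructions being regrouped here, with function names of my choosing.)

#include <cstdint>
int64_t cdqe(int32_t Eax)   { return Eax; }              // RAX = signext(EAX)
int32_t cwde(int16_t Ax)    { return Ax; }               // EAX = signext(AX)
int16_t cwdDx(int16_t Ax)   { return Ax < 0 ? -1 : 0; }  // DX  = sign fill of AX
int64_t cqoRdx(int64_t Rax) { return Rax < 0 ? -1 : 0; } // RDX = sign fill of RAX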
let Defs = [AX,DX], Uses = [AX] in // DX:AX = signext(AX) def CWD : I<0x99, RawFrm, (outs), (ins), "{cwtd|cwd}", []>, OpSize16, Sched<[WriteALU]>; let Defs = [EAX,EDX], Uses = [EAX] in // EDX:EAX = signext(EAX) def CDQ : I<0x99, RawFrm, (outs), (ins), "{cltd|cdq}", []>, OpSize32, Sched<[WriteALU]>; - - - let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX) - def CDQE : RI<0x98, RawFrm, (outs), (ins), - "{cltq|cdqe}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>; - let Defs = [RAX,RDX], Uses = [RAX] in // RDX:RAX = signext(RAX) def CQO : RI<0x99, RawFrm, (outs), (ins), "{cqto|cqo}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>; diff --git a/lib/Target/X86/X86InstrFoldTables.cpp b/lib/Target/X86/X86InstrFoldTables.cpp index d42fec3770c7..f3b286e0375c 100644 --- a/lib/Target/X86/X86InstrFoldTables.cpp +++ b/lib/Target/X86/X86InstrFoldTables.cpp @@ -292,6 +292,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { { X86::JMP32r_NT, X86::JMP32m_NT, TB_FOLDED_LOAD }, { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD }, { X86::JMP64r_NT, X86::JMP64m_NT, TB_FOLDED_LOAD }, + { X86::MMX_MOVD64from64rr, X86::MMX_MOVD64from64rm, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::MMX_MOVD64grr, X86::MMX_MOVD64mr, TB_FOLDED_STORE | TB_NO_REVERSE }, { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE }, { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE }, { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE }, @@ -5245,6 +5247,270 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { { X86::VXORPSZrrk, X86::VXORPSZrmk, 0 }, }; +static const X86MemoryFoldTableEntry BroadcastFoldTable2[] = { + { X86::VADDPDZ128rr, X86::VADDPDZ128rmb, TB_BCAST_SD }, + { X86::VADDPDZ256rr, X86::VADDPDZ256rmb, TB_BCAST_SD }, + { X86::VADDPDZrr, X86::VADDPDZrmb, TB_BCAST_SD }, + { X86::VADDPSZ128rr, X86::VADDPSZ128rmb, TB_BCAST_SS }, + { X86::VADDPSZ256rr, X86::VADDPSZ256rmb, TB_BCAST_SS }, + { X86::VADDPSZrr, X86::VADDPSZrmb, TB_BCAST_SS }, + { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmbi, TB_BCAST_SD }, + { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmbi, TB_BCAST_SD }, + { X86::VCMPPDZrri, X86::VCMPPDZrmbi, TB_BCAST_SD }, + { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmbi, TB_BCAST_SS }, + { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmbi, TB_BCAST_SS }, + { X86::VCMPPSZrri, X86::VCMPPSZrmbi, TB_BCAST_SS }, + { X86::VDIVPDZ128rr, X86::VDIVPDZ128rmb, TB_BCAST_SD }, + { X86::VDIVPDZ256rr, X86::VDIVPDZ256rmb, TB_BCAST_SD }, + { X86::VDIVPDZrr, X86::VDIVPDZrmb, TB_BCAST_SD }, + { X86::VDIVPSZ128rr, X86::VDIVPSZ128rmb, TB_BCAST_SS }, + { X86::VDIVPSZ256rr, X86::VDIVPSZ256rmb, TB_BCAST_SS }, + { X86::VDIVPSZrr, X86::VDIVPSZrmb, TB_BCAST_SS }, + { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rmb, TB_BCAST_SD }, + { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rmb, TB_BCAST_SD }, + { X86::VMAXCPDZrr, X86::VMAXCPDZrmb, TB_BCAST_SD }, + { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rmb, TB_BCAST_SS }, + { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rmb, TB_BCAST_SS }, + { X86::VMAXCPSZrr, X86::VMAXCPSZrmb, TB_BCAST_SS }, + { X86::VMAXPDZ128rr, X86::VMAXPDZ128rmb, TB_BCAST_SD }, + { X86::VMAXPDZ256rr, X86::VMAXPDZ256rmb, TB_BCAST_SD }, + { X86::VMAXPDZrr, X86::VMAXPDZrmb, TB_BCAST_SD }, + { X86::VMAXPSZ128rr, X86::VMAXPSZ128rmb, TB_BCAST_SS }, + { X86::VMAXPSZ256rr, X86::VMAXPSZ256rmb, TB_BCAST_SS }, + { X86::VMAXPSZrr, X86::VMAXPSZrmb, TB_BCAST_SS }, + { X86::VMINCPDZ128rr, X86::VMINCPDZ128rmb, TB_BCAST_SD }, + { X86::VMINCPDZ256rr, X86::VMINCPDZ256rmb, TB_BCAST_SD }, + { X86::VMINCPDZrr, X86::VMINCPDZrmb, TB_BCAST_SD }, + { X86::VMINCPSZ128rr, X86::VMINCPSZ128rmb, TB_BCAST_SS }, + { 
X86::VMINCPSZ256rr, X86::VMINCPSZ256rmb, TB_BCAST_SS }, + { X86::VMINCPSZrr, X86::VMINCPSZrmb, TB_BCAST_SS }, + { X86::VMINPDZ128rr, X86::VMINPDZ128rmb, TB_BCAST_SD }, + { X86::VMINPDZ256rr, X86::VMINPDZ256rmb, TB_BCAST_SD }, + { X86::VMINPDZrr, X86::VMINPDZrmb, TB_BCAST_SD }, + { X86::VMINPSZ128rr, X86::VMINPSZ128rmb, TB_BCAST_SS }, + { X86::VMINPSZ256rr, X86::VMINPSZ256rmb, TB_BCAST_SS }, + { X86::VMINPSZrr, X86::VMINPSZrmb, TB_BCAST_SS }, + { X86::VMULPDZ128rr, X86::VMULPDZ128rmb, TB_BCAST_SD }, + { X86::VMULPDZ256rr, X86::VMULPDZ256rmb, TB_BCAST_SD }, + { X86::VMULPDZrr, X86::VMULPDZrmb, TB_BCAST_SD }, + { X86::VMULPSZ128rr, X86::VMULPSZ128rmb, TB_BCAST_SS }, + { X86::VMULPSZ256rr, X86::VMULPSZ256rmb, TB_BCAST_SS }, + { X86::VMULPSZrr, X86::VMULPSZrmb, TB_BCAST_SS }, + { X86::VPADDDZ128rr, X86::VPADDDZ128rmb, TB_BCAST_D }, + { X86::VPADDDZ256rr, X86::VPADDDZ256rmb, TB_BCAST_D }, + { X86::VPADDDZrr, X86::VPADDDZrmb, TB_BCAST_D }, + { X86::VPADDQZ128rr, X86::VPADDQZ128rmb, TB_BCAST_Q }, + { X86::VPADDQZ256rr, X86::VPADDQZ256rmb, TB_BCAST_Q }, + { X86::VPADDQZrr, X86::VPADDQZrmb, TB_BCAST_Q }, + { X86::VPANDDZ128rr, X86::VPANDDZ128rmb, TB_BCAST_D }, + { X86::VPANDDZ256rr, X86::VPANDDZ256rmb, TB_BCAST_D }, + { X86::VPANDDZrr, X86::VPANDDZrmb, TB_BCAST_D }, + { X86::VPANDNDZ128rr, X86::VPANDNDZ128rmb, TB_BCAST_D }, + { X86::VPANDNDZ256rr, X86::VPANDNDZ256rmb, TB_BCAST_D }, + { X86::VPANDNDZrr, X86::VPANDNDZrmb, TB_BCAST_D }, + { X86::VPANDNQZ128rr, X86::VPANDNQZ128rmb, TB_BCAST_Q }, + { X86::VPANDNQZ256rr, X86::VPANDNQZ256rmb, TB_BCAST_Q }, + { X86::VPANDNQZrr, X86::VPANDNQZrmb, TB_BCAST_Q }, + { X86::VPANDQZ128rr, X86::VPANDQZ128rmb, TB_BCAST_Q }, + { X86::VPANDQZ256rr, X86::VPANDQZ256rmb, TB_BCAST_Q }, + { X86::VPANDQZrr, X86::VPANDQZrmb, TB_BCAST_Q }, + { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmib, TB_BCAST_D }, + { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmib, TB_BCAST_D }, + { X86::VPCMPDZrri, X86::VPCMPDZrmib, TB_BCAST_D }, + { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rmb, TB_BCAST_D }, + { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rmb, TB_BCAST_D }, + { X86::VPCMPEQDZrr, X86::VPCMPEQDZrmb, TB_BCAST_D }, + { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rmb, TB_BCAST_Q }, + { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rmb, TB_BCAST_Q }, + { X86::VPCMPEQQZrr, X86::VPCMPEQQZrmb, TB_BCAST_Q }, + { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rmb, TB_BCAST_D }, + { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rmb, TB_BCAST_D }, + { X86::VPCMPGTDZrr, X86::VPCMPGTDZrmb, TB_BCAST_D }, + { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rmb, TB_BCAST_Q }, + { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rmb, TB_BCAST_Q }, + { X86::VPCMPGTQZrr, X86::VPCMPGTQZrmb, TB_BCAST_Q }, + { X86::VPCMPQZ128rri, X86::VPCMPQZ128rmib, TB_BCAST_Q }, + { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmib, TB_BCAST_Q }, + { X86::VPCMPQZrri, X86::VPCMPQZrmib, TB_BCAST_Q }, + { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmib, TB_BCAST_D }, + { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmib, TB_BCAST_D }, + { X86::VPCMPUDZrri, X86::VPCMPUDZrmib, TB_BCAST_D }, + { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmib, TB_BCAST_Q }, + { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmib, TB_BCAST_Q }, + { X86::VPCMPUQZrri, X86::VPCMPUQZrmib, TB_BCAST_Q }, + { X86::VPMAXSDZ128rr, X86::VPMAXSDZ128rmb, TB_BCAST_D }, + { X86::VPMAXSDZ256rr, X86::VPMAXSDZ256rmb, TB_BCAST_D }, + { X86::VPMAXSDZrr, X86::VPMAXSDZrmb, TB_BCAST_D }, + { X86::VPMAXSQZ128rr, X86::VPMAXSQZ128rmb, TB_BCAST_Q }, + { X86::VPMAXSQZ256rr, X86::VPMAXSQZ256rmb, TB_BCAST_Q }, + { X86::VPMAXSQZrr, X86::VPMAXSQZrmb, TB_BCAST_Q }, + { X86::VPMAXUDZ128rr, 
X86::VPMAXUDZ128rmb, TB_BCAST_D }, + { X86::VPMAXUDZ256rr, X86::VPMAXUDZ256rmb, TB_BCAST_D }, + { X86::VPMAXUDZrr, X86::VPMAXUDZrmb, TB_BCAST_D }, + { X86::VPMAXUQZ128rr, X86::VPMAXUQZ128rmb, TB_BCAST_Q }, + { X86::VPMAXUQZ256rr, X86::VPMAXUQZ256rmb, TB_BCAST_Q }, + { X86::VPMAXUQZrr, X86::VPMAXUQZrmb, TB_BCAST_Q }, + { X86::VPMINSDZ128rr, X86::VPMINSDZ128rmb, TB_BCAST_D }, + { X86::VPMINSDZ256rr, X86::VPMINSDZ256rmb, TB_BCAST_D }, + { X86::VPMINSDZrr, X86::VPMINSDZrmb, TB_BCAST_D }, + { X86::VPMINSQZ128rr, X86::VPMINSQZ128rmb, TB_BCAST_Q }, + { X86::VPMINSQZ256rr, X86::VPMINSQZ256rmb, TB_BCAST_Q }, + { X86::VPMINSQZrr, X86::VPMINSQZrmb, TB_BCAST_Q }, + { X86::VPMINUDZ128rr, X86::VPMINUDZ128rmb, TB_BCAST_D }, + { X86::VPMINUDZ256rr, X86::VPMINUDZ256rmb, TB_BCAST_D }, + { X86::VPMINUDZrr, X86::VPMINUDZrmb, TB_BCAST_D }, + { X86::VPMINUQZ128rr, X86::VPMINUQZ128rmb, TB_BCAST_Q }, + { X86::VPMINUQZ256rr, X86::VPMINUQZ256rmb, TB_BCAST_Q }, + { X86::VPMINUQZrr, X86::VPMINUQZrmb, TB_BCAST_Q }, + { X86::VPMULLDZ128rr, X86::VPMULLDZ128rmb, TB_BCAST_D }, + { X86::VPMULLDZ256rr, X86::VPMULLDZ256rmb, TB_BCAST_D }, + { X86::VPMULLDZrr, X86::VPMULLDZrmb, TB_BCAST_D }, + { X86::VPMULLQZ128rr, X86::VPMULLQZ128rmb, TB_BCAST_Q }, + { X86::VPMULLQZ256rr, X86::VPMULLQZ256rmb, TB_BCAST_Q }, + { X86::VPMULLQZrr, X86::VPMULLQZrmb, TB_BCAST_Q }, + { X86::VPORDZ128rr, X86::VPORDZ128rmb, TB_BCAST_D }, + { X86::VPORDZ256rr, X86::VPORDZ256rmb, TB_BCAST_D }, + { X86::VPORDZrr, X86::VPORDZrmb, TB_BCAST_D }, + { X86::VPORQZ128rr, X86::VPORQZ128rmb, TB_BCAST_Q }, + { X86::VPORQZ256rr, X86::VPORQZ256rmb, TB_BCAST_Q }, + { X86::VPORQZrr, X86::VPORQZrmb, TB_BCAST_Q }, + { X86::VPTESTMDZ128rr, X86::VPTESTMDZ128rmb, TB_BCAST_D }, + { X86::VPTESTMDZ256rr, X86::VPTESTMDZ256rmb, TB_BCAST_D }, + { X86::VPTESTMDZrr, X86::VPTESTMDZrmb, TB_BCAST_D }, + { X86::VPTESTMQZ128rr, X86::VPTESTMQZ128rmb, TB_BCAST_Q }, + { X86::VPTESTMQZ256rr, X86::VPTESTMQZ256rmb, TB_BCAST_Q }, + { X86::VPTESTMQZrr, X86::VPTESTMQZrmb, TB_BCAST_Q }, + { X86::VPTESTNMDZ128rr,X86::VPTESTNMDZ128rmb,TB_BCAST_D }, + { X86::VPTESTNMDZ256rr,X86::VPTESTNMDZ256rmb,TB_BCAST_D }, + { X86::VPTESTNMDZrr, X86::VPTESTNMDZrmb, TB_BCAST_D }, + { X86::VPTESTNMQZ128rr,X86::VPTESTNMQZ128rmb,TB_BCAST_Q }, + { X86::VPTESTNMQZ256rr,X86::VPTESTNMQZ256rmb,TB_BCAST_Q }, + { X86::VPTESTNMQZrr, X86::VPTESTNMQZrmb, TB_BCAST_Q }, + { X86::VPXORDZ128rr, X86::VPXORDZ128rmb, TB_BCAST_D }, + { X86::VPXORDZ256rr, X86::VPXORDZ256rmb, TB_BCAST_D }, + { X86::VPXORDZrr, X86::VPXORDZrmb, TB_BCAST_D }, + { X86::VPXORQZ128rr, X86::VPXORQZ128rmb, TB_BCAST_Q }, + { X86::VPXORQZ256rr, X86::VPXORQZ256rmb, TB_BCAST_Q }, + { X86::VPXORQZrr, X86::VPXORQZrmb, TB_BCAST_Q }, + { X86::VSUBPDZ128rr, X86::VSUBPDZ128rmb, TB_BCAST_SD }, + { X86::VSUBPDZ256rr, X86::VSUBPDZ256rmb, TB_BCAST_SD }, + { X86::VSUBPDZrr, X86::VSUBPDZrmb, TB_BCAST_SD }, + { X86::VSUBPSZ128rr, X86::VSUBPSZ128rmb, TB_BCAST_SS }, + { X86::VSUBPSZ256rr, X86::VSUBPSZ256rmb, TB_BCAST_SS }, + { X86::VSUBPSZrr, X86::VSUBPSZrmb, TB_BCAST_SS }, +}; + +static const X86MemoryFoldTableEntry BroadcastFoldTable3[] = { + { X86::VFMADD132PDZ128r, X86::VFMADD132PDZ128mb, TB_BCAST_SD }, + { X86::VFMADD132PDZ256r, X86::VFMADD132PDZ256mb, TB_BCAST_SD }, + { X86::VFMADD132PDZr, X86::VFMADD132PDZmb, TB_BCAST_SD }, + { X86::VFMADD132PSZ128r, X86::VFMADD132PSZ128mb, TB_BCAST_SS }, + { X86::VFMADD132PSZ256r, X86::VFMADD132PSZ256mb, TB_BCAST_SS }, + { X86::VFMADD132PSZr, X86::VFMADD132PSZmb, TB_BCAST_SS }, + { X86::VFMADD213PDZ128r, X86::VFMADD213PDZ128mb, 
TB_BCAST_SD }, + { X86::VFMADD213PDZ256r, X86::VFMADD213PDZ256mb, TB_BCAST_SD }, + { X86::VFMADD213PDZr, X86::VFMADD213PDZmb, TB_BCAST_SD }, + { X86::VFMADD213PSZ128r, X86::VFMADD213PSZ128mb, TB_BCAST_SS }, + { X86::VFMADD213PSZ256r, X86::VFMADD213PSZ256mb, TB_BCAST_SS }, + { X86::VFMADD213PSZr, X86::VFMADD213PSZmb, TB_BCAST_SS }, + { X86::VFMADD231PDZ128r, X86::VFMADD231PDZ128mb, TB_BCAST_SD }, + { X86::VFMADD231PDZ256r, X86::VFMADD231PDZ256mb, TB_BCAST_SD }, + { X86::VFMADD231PDZr, X86::VFMADD231PDZmb, TB_BCAST_SD }, + { X86::VFMADD231PSZ128r, X86::VFMADD231PSZ128mb, TB_BCAST_SS }, + { X86::VFMADD231PSZ256r, X86::VFMADD231PSZ256mb, TB_BCAST_SS }, + { X86::VFMADD231PSZr, X86::VFMADD231PSZmb, TB_BCAST_SS }, + { X86::VFMADDSUB132PDZ128r, X86::VFMADDSUB132PDZ128mb, TB_BCAST_SD }, + { X86::VFMADDSUB132PDZ256r, X86::VFMADDSUB132PDZ256mb, TB_BCAST_SD }, + { X86::VFMADDSUB132PDZr, X86::VFMADDSUB132PDZmb, TB_BCAST_SD }, + { X86::VFMADDSUB132PSZ128r, X86::VFMADDSUB132PSZ128mb, TB_BCAST_SS }, + { X86::VFMADDSUB132PSZ256r, X86::VFMADDSUB132PSZ256mb, TB_BCAST_SS }, + { X86::VFMADDSUB132PSZr, X86::VFMADDSUB132PSZmb, TB_BCAST_SS }, + { X86::VFMADDSUB213PDZ128r, X86::VFMADDSUB213PDZ128mb, TB_BCAST_SD }, + { X86::VFMADDSUB213PDZ256r, X86::VFMADDSUB213PDZ256mb, TB_BCAST_SD }, + { X86::VFMADDSUB213PDZr, X86::VFMADDSUB213PDZmb, TB_BCAST_SD }, + { X86::VFMADDSUB213PSZ128r, X86::VFMADDSUB213PSZ128mb, TB_BCAST_SS }, + { X86::VFMADDSUB213PSZ256r, X86::VFMADDSUB213PSZ256mb, TB_BCAST_SS }, + { X86::VFMADDSUB213PSZr, X86::VFMADDSUB213PSZmb, TB_BCAST_SS }, + { X86::VFMADDSUB231PDZ128r, X86::VFMADDSUB231PDZ128mb, TB_BCAST_SD }, + { X86::VFMADDSUB231PDZ256r, X86::VFMADDSUB231PDZ256mb, TB_BCAST_SD }, + { X86::VFMADDSUB231PDZr, X86::VFMADDSUB231PDZmb, TB_BCAST_SD }, + { X86::VFMADDSUB231PSZ128r, X86::VFMADDSUB231PSZ128mb, TB_BCAST_SS }, + { X86::VFMADDSUB231PSZ256r, X86::VFMADDSUB231PSZ256mb, TB_BCAST_SS }, + { X86::VFMADDSUB231PSZr, X86::VFMADDSUB231PSZmb, TB_BCAST_SS }, + { X86::VFMSUB132PDZ128r, X86::VFMSUB132PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUB132PDZ256r, X86::VFMSUB132PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUB132PDZr, X86::VFMSUB132PDZmb, TB_BCAST_SD }, + { X86::VFMSUB132PSZ128r, X86::VFMSUB132PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUB132PSZ256r, X86::VFMSUB132PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUB132PSZr, X86::VFMSUB132PSZmb, TB_BCAST_SS }, + { X86::VFMSUB213PDZ128r, X86::VFMSUB213PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUB213PDZ256r, X86::VFMSUB213PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUB213PDZr, X86::VFMSUB213PDZmb, TB_BCAST_SD }, + { X86::VFMSUB213PSZ128r, X86::VFMSUB213PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUB213PSZ256r, X86::VFMSUB213PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUB213PSZr, X86::VFMSUB213PSZmb, TB_BCAST_SS }, + { X86::VFMSUB231PDZ128r, X86::VFMSUB231PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUB231PDZ256r, X86::VFMSUB231PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUB231PDZr, X86::VFMSUB231PDZmb, TB_BCAST_SD }, + { X86::VFMSUB231PSZ128r, X86::VFMSUB231PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUB231PSZ256r, X86::VFMSUB231PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUB231PSZr, X86::VFMSUB231PSZmb, TB_BCAST_SS }, + { X86::VFMSUBADD132PDZ128r, X86::VFMSUBADD132PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUBADD132PDZ256r, X86::VFMSUBADD132PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUBADD132PDZr, X86::VFMSUBADD132PDZmb, TB_BCAST_SD }, + { X86::VFMSUBADD132PSZ128r, X86::VFMSUBADD132PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUBADD132PSZ256r, X86::VFMSUBADD132PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUBADD132PSZr, X86::VFMSUBADD132PSZmb, TB_BCAST_SS }, + { 
X86::VFMSUBADD213PDZ128r, X86::VFMSUBADD213PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUBADD213PDZ256r, X86::VFMSUBADD213PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUBADD213PDZr, X86::VFMSUBADD213PDZmb, TB_BCAST_SD }, + { X86::VFMSUBADD213PSZ128r, X86::VFMSUBADD213PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUBADD213PSZ256r, X86::VFMSUBADD213PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUBADD213PSZr, X86::VFMSUBADD213PSZmb, TB_BCAST_SS }, + { X86::VFMSUBADD231PDZ128r, X86::VFMSUBADD231PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUBADD231PDZ256r, X86::VFMSUBADD231PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUBADD231PDZr, X86::VFMSUBADD231PDZmb, TB_BCAST_SD }, + { X86::VFMSUBADD231PSZ128r, X86::VFMSUBADD231PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUBADD231PSZ256r, X86::VFMSUBADD231PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUBADD231PSZr, X86::VFMSUBADD231PSZmb, TB_BCAST_SS }, + { X86::VFNMADD132PDZ128r, X86::VFNMADD132PDZ128mb, TB_BCAST_SD }, + { X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256mb, TB_BCAST_SD }, + { X86::VFNMADD132PDZr, X86::VFNMADD132PDZmb, TB_BCAST_SD }, + { X86::VFNMADD132PSZ128r, X86::VFNMADD132PSZ128mb, TB_BCAST_SS }, + { X86::VFNMADD132PSZ256r, X86::VFNMADD132PSZ256mb, TB_BCAST_SS }, + { X86::VFNMADD132PSZr, X86::VFNMADD132PSZmb, TB_BCAST_SS }, + { X86::VFNMADD213PDZ128r, X86::VFNMADD213PDZ128mb, TB_BCAST_SD }, + { X86::VFNMADD213PDZ256r, X86::VFNMADD213PDZ256mb, TB_BCAST_SD }, + { X86::VFNMADD213PDZr, X86::VFNMADD213PDZmb, TB_BCAST_SD }, + { X86::VFNMADD213PSZ128r, X86::VFNMADD213PSZ128mb, TB_BCAST_SS }, + { X86::VFNMADD213PSZ256r, X86::VFNMADD213PSZ256mb, TB_BCAST_SS }, + { X86::VFNMADD213PSZr, X86::VFNMADD213PSZmb, TB_BCAST_SS }, + { X86::VFNMADD231PDZ128r, X86::VFNMADD231PDZ128mb, TB_BCAST_SD }, + { X86::VFNMADD231PDZ256r, X86::VFNMADD231PDZ256mb, TB_BCAST_SD }, + { X86::VFNMADD231PDZr, X86::VFNMADD231PDZmb, TB_BCAST_SD }, + { X86::VFNMADD231PSZ128r, X86::VFNMADD231PSZ128mb, TB_BCAST_SS }, + { X86::VFNMADD231PSZ256r, X86::VFNMADD231PSZ256mb, TB_BCAST_SS }, + { X86::VFNMADD231PSZr, X86::VFNMADD231PSZmb, TB_BCAST_SS }, + { X86::VFNMSUB132PDZ128r, X86::VFNMSUB132PDZ128mb, TB_BCAST_SD }, + { X86::VFNMSUB132PDZ256r, X86::VFNMSUB132PDZ256mb, TB_BCAST_SD }, + { X86::VFNMSUB132PDZr, X86::VFNMSUB132PDZmb, TB_BCAST_SD }, + { X86::VFNMSUB132PSZ128r, X86::VFNMSUB132PSZ128mb, TB_BCAST_SS }, + { X86::VFNMSUB132PSZ256r, X86::VFNMSUB132PSZ256mb, TB_BCAST_SS }, + { X86::VFNMSUB132PSZr, X86::VFNMSUB132PSZmb, TB_BCAST_SS }, + { X86::VFNMSUB213PDZ128r, X86::VFNMSUB213PDZ128mb, TB_BCAST_SD }, + { X86::VFNMSUB213PDZ256r, X86::VFNMSUB213PDZ256mb, TB_BCAST_SD }, + { X86::VFNMSUB213PDZr, X86::VFNMSUB213PDZmb, TB_BCAST_SD }, + { X86::VFNMSUB213PSZ128r, X86::VFNMSUB213PSZ128mb, TB_BCAST_SS }, + { X86::VFNMSUB213PSZ256r, X86::VFNMSUB213PSZ256mb, TB_BCAST_SS }, + { X86::VFNMSUB213PSZr, X86::VFNMSUB213PSZmb, TB_BCAST_SS }, + { X86::VFNMSUB231PDZ128r, X86::VFNMSUB231PDZ128mb, TB_BCAST_SD }, + { X86::VFNMSUB231PDZ256r, X86::VFNMSUB231PDZ256mb, TB_BCAST_SD }, + { X86::VFNMSUB231PDZr, X86::VFNMSUB231PDZmb, TB_BCAST_SD }, + { X86::VFNMSUB231PSZ128r, X86::VFNMSUB231PSZ128mb, TB_BCAST_SS }, + { X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256mb, TB_BCAST_SS }, + { X86::VFNMSUB231PSZr, X86::VFNMSUB231PSZmb, TB_BCAST_SS }, +}; + static const X86MemoryFoldTableEntry * lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) { #ifndef NDEBUG @@ -5287,6 +5553,18 @@ lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) { std::end(MemoryFoldTable4)) == std::end(MemoryFoldTable4) && "MemoryFoldTable4 is not sorted and 
unique!"); + assert(std::is_sorted(std::begin(BroadcastFoldTable2), + std::end(BroadcastFoldTable2)) && + std::adjacent_find(std::begin(BroadcastFoldTable2), + std::end(BroadcastFoldTable2)) == + std::end(BroadcastFoldTable2) && + "BroadcastFoldTable2 is not sorted and unique!"); + assert(std::is_sorted(std::begin(BroadcastFoldTable3), + std::end(BroadcastFoldTable3)) && + std::adjacent_find(std::begin(BroadcastFoldTable3), + std::end(BroadcastFoldTable3)) == + std::end(BroadcastFoldTable3) && + "BroadcastFoldTable3 is not sorted and unique!"); FoldTablesChecked.store(true, std::memory_order_relaxed); } #endif @@ -5355,6 +5633,15 @@ struct X86MemUnfoldTable { // Index 4, folded load addTableEntry(Entry, TB_INDEX_4 | TB_FOLDED_LOAD); + // Broadcast tables. + for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable2) + // Index 2, folded broadcast + addTableEntry(Entry, TB_INDEX_2 | TB_FOLDED_LOAD | TB_FOLDED_BCAST); + + for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable3) + // Index 2, folded broadcast + addTableEntry(Entry, TB_INDEX_3 | TB_FOLDED_LOAD | TB_FOLDED_BCAST); + // Sort the memory->reg unfold table. array_pod_sort(Table.begin(), Table.end()); diff --git a/lib/Target/X86/X86InstrFoldTables.h b/lib/Target/X86/X86InstrFoldTables.h index 419baf98f61d..7dc236a0d7e4 100644 --- a/lib/Target/X86/X86InstrFoldTables.h +++ b/lib/Target/X86/X86InstrFoldTables.h @@ -19,35 +19,48 @@ namespace llvm { enum { // Select which memory operand is being unfolded. - // (stored in bits 0 - 3) + // (stored in bits 0 - 2) TB_INDEX_0 = 0, TB_INDEX_1 = 1, TB_INDEX_2 = 2, TB_INDEX_3 = 3, TB_INDEX_4 = 4, - TB_INDEX_MASK = 0xf, + TB_INDEX_MASK = 0x7, // Do not insert the reverse map (MemOp -> RegOp) into the table. // This may be needed because there is a many -> one mapping. - TB_NO_REVERSE = 1 << 4, + TB_NO_REVERSE = 1 << 3, // Do not insert the forward map (RegOp -> MemOp) into the table. // This is needed for Native Client, which prohibits branch // instructions from using a memory operand. - TB_NO_FORWARD = 1 << 5, + TB_NO_FORWARD = 1 << 4, - TB_FOLDED_LOAD = 1 << 6, - TB_FOLDED_STORE = 1 << 7, + TB_FOLDED_LOAD = 1 << 5, + TB_FOLDED_STORE = 1 << 6, + TB_FOLDED_BCAST = 1 << 7, // Minimum alignment required for load/store. - // Used for RegOp->MemOp conversion. - // (stored in bits 8 - 15) + // Used for RegOp->MemOp conversion. Encoded as Log2(Align) + 1 to allow 0 + // to mean align of 0. + // (stored in bits 8 - 11) TB_ALIGN_SHIFT = 8, - TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT, - TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT, - TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT, - TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT, - TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT + TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT, + TB_ALIGN_16 = 5 << TB_ALIGN_SHIFT, + TB_ALIGN_32 = 6 << TB_ALIGN_SHIFT, + TB_ALIGN_64 = 7 << TB_ALIGN_SHIFT, + TB_ALIGN_MASK = 0xf << TB_ALIGN_SHIFT, + + // Broadcast type. + // (stored in bits 12 - 13) + TB_BCAST_TYPE_SHIFT = 12, + TB_BCAST_D = 0 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_Q = 1 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_SS = 2 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_SD = 3 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_MASK = 0x3 << TB_BCAST_TYPE_SHIFT, + + // Unused bits 14-15 }; // This struct is used for both the folding and unfold tables. 
They KeyOp diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 096cc27861ca..de6f8a81dff6 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -103,6 +103,8 @@ def X86vzld : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def X86vextractst : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86VBroadcastld : SDNode<"X86ISD::VBROADCAST_LOAD", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCisInt<1>, @@ -954,6 +956,26 @@ def X86vextractstore64 : PatFrag<(ops node:$val, node:$ptr), return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8; }]>; +def X86VBroadcastld8 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 1; +}]>; + +def X86VBroadcastld16 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 2; +}]>; + +def X86VBroadcastld32 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 4; +}]>; + +def X86VBroadcastld64 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8; +}]>; + def fp32imm0 : PatLeaf<(f32 fpimm), [{ return N->isExactlyValue(+0.0); @@ -963,6 +985,10 @@ def fp64imm0 : PatLeaf<(f64 fpimm), [{ return N->isExactlyValue(+0.0); }]>; +def fp128imm0 : PatLeaf<(f128 fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; + // EXTRACT_get_vextract128_imm xform function: convert extract_subvector index // to VEXTRACTF128/VEXTRACTI128 imm. def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{ diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index dbe45356c42b..c29029daeec9 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -30,7 +30,7 @@ #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" @@ -465,7 +465,7 @@ unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI, /// Return true if register is PIC base; i.e.g defined by X86::MOVPC32r. static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) { // Don't waste compile time scanning use-def chains of physregs. - if (!TargetRegisterInfo::isVirtualRegister(BaseReg)) + if (!Register::isVirtualRegister(BaseReg)) return false; bool isPICBase = false; for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg), @@ -480,9 +480,50 @@ static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) { } bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, - AliasAnalysis *AA) const { + AAResults *AA) const { switch (MI.getOpcode()) { - default: break; + default: + // This function should only be called for opcodes with the ReMaterializable + // flag set. 
+ llvm_unreachable("Unknown rematerializable operation!"); + break; + + case X86::LOAD_STACK_GUARD: + case X86::AVX1_SETALLONES: + case X86::AVX2_SETALLONES: + case X86::AVX512_128_SET0: + case X86::AVX512_256_SET0: + case X86::AVX512_512_SET0: + case X86::AVX512_512_SETALLONES: + case X86::AVX512_FsFLD0SD: + case X86::AVX512_FsFLD0SS: + case X86::AVX512_FsFLD0F128: + case X86::AVX_SET0: + case X86::FsFLD0SD: + case X86::FsFLD0SS: + case X86::FsFLD0F128: + case X86::KSET0D: + case X86::KSET0Q: + case X86::KSET0W: + case X86::KSET1D: + case X86::KSET1Q: + case X86::KSET1W: + case X86::MMX_SET0: + case X86::MOV32ImmSExti8: + case X86::MOV32r0: + case X86::MOV32r1: + case X86::MOV32r_1: + case X86::MOV32ri64: + case X86::MOV64ImmSExti8: + case X86::V_SET0: + case X86::V_SETALLONES: + case X86::MOV16ri: + case X86::MOV32ri: + case X86::MOV64ri: + case X86::MOV64ri32: + case X86::MOV8ri: + return true; + case X86::MOV8rm: case X86::MOV8rm_NOREX: case X86::MOV16rm: @@ -561,7 +602,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, MI.getOperand(1 + X86::AddrIndexReg).isReg() && MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 && MI.isDereferenceableInvariantLoad(AA)) { - unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); + Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); if (BaseReg == 0 || BaseReg == X86::RIP) return true; // Allow re-materialization of PIC load. @@ -583,7 +624,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, // lea fi#, lea GV, etc. are all rematerializable. if (!MI.getOperand(1 + X86::AddrBaseReg).isReg()) return true; - unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); + Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); if (BaseReg == 0) return true; // Allow re-materialization of lea PICBase + x. @@ -594,10 +635,6 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, return false; } } - - // All other instructions marked M_REMATERIALIZABLE are always trivially - // rematerializable. - return true; } void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, @@ -664,7 +701,7 @@ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { } bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, - unsigned Opc, bool AllowSP, unsigned &NewSrc, + unsigned Opc, bool AllowSP, Register &NewSrc, bool &isKill, MachineOperand &ImplicitOp, LiveVariables *LV) const { MachineFunction &MF = *MI.getParent()->getParent(); @@ -675,7 +712,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, RC = Opc != X86::LEA32r ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass; } - unsigned SrcReg = Src.getReg(); + Register SrcReg = Src.getReg(); // For both LEA64 and LEA32 the register already has essentially the right // type (32-bit or 64-bit) we may just need to forbid SP. @@ -684,7 +721,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, isKill = Src.isKill(); assert(!Src.isUndef() && "Undef op doesn't need optimization"); - if (TargetRegisterInfo::isVirtualRegister(NewSrc) && + if (Register::isVirtualRegister(NewSrc) && !MF.getRegInfo().constrainRegClass(NewSrc, RC)) return false; @@ -693,7 +730,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, // This is for an LEA64_32r and incoming registers are 32-bit. One way or // another we need to add 64-bit registers to the final MI. 
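(Illustrative aside, not part of the patch: a concrete instance of the LEA64_32r case — a 32-bit add can become something like leal 4(%rax,%rcx), %edx, where the destination is 32-bit but the address computation reads the full 64-bit RAX and RCX, so the 32-bit sources must be rewrapped as 64-bit registers on the final MI, with the originals kept as implicit operands for liveness bookkeeping.)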
- if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + if (Register::isPhysicalRegister(SrcReg)) { ImplicitOp = Src; ImplicitOp.setImplicit(); @@ -740,8 +777,8 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( return nullptr; unsigned Opcode = X86::LEA64_32r; - unsigned InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); - unsigned OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass); + Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); + Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass); // Build and insert into an implicit UNDEF value. This is OK because // we will be shifting and then extracting the lower 8/16-bits. @@ -751,8 +788,8 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( // But testing has shown this *does* help performance in 64-bit mode (at // least on modern x86 machines). MachineBasicBlock::iterator MBBI = MI.getIterator(); - unsigned Dest = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); + Register Dest = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); bool IsDead = MI.getOperand(0).isDead(); bool IsKill = MI.getOperand(1).isKill(); unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit; @@ -794,7 +831,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( case X86::ADD8rr_DB: case X86::ADD16rr: case X86::ADD16rr_DB: { - unsigned Src2 = MI.getOperand(2).getReg(); + Register Src2 = MI.getOperand(2).getReg(); bool IsKill2 = MI.getOperand(2).isKill(); assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization"); unsigned InRegLEA2 = 0; @@ -888,7 +925,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; // LEA can't handle RSP. - if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && + if (Register::isVirtualRegister(Src.getReg()) && !MF.getRegInfo().constrainRegClass(Src.getReg(), &X86::GR64_NOSPRegClass)) return nullptr; @@ -911,7 +948,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, // LEA can't handle ESP. bool isKill; - unsigned SrcReg; + Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, ImplicitOp, LV)) @@ -947,7 +984,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r : (Is64Bit ? X86::LEA64_32r : X86::LEA32r); bool isKill; - unsigned SrcReg; + Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, ImplicitOp, LV)) @@ -970,7 +1007,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, : (Is64Bit ? X86::LEA64_32r : X86::LEA32r); bool isKill; - unsigned SrcReg; + Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, ImplicitOp, LV)) @@ -1005,7 +1042,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, Opc = Is64Bit ? 
X86::LEA64_32r : X86::LEA32r; bool isKill; - unsigned SrcReg; + Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, SrcReg, isKill, ImplicitOp, LV)) @@ -1013,7 +1050,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, const MachineOperand &Src2 = MI.getOperand(2); bool isKill2; - unsigned SrcReg2; + Register SrcReg2; MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, SrcReg2, isKill2, ImplicitOp2, LV)) @@ -1054,7 +1091,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; bool isKill; - unsigned SrcReg; + Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, SrcReg, isKill, ImplicitOp, LV)) @@ -1085,6 +1122,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return nullptr; case X86::SUB32ri8: case X86::SUB32ri: { + if (!MI.getOperand(2).isImm()) + return nullptr; int64_t Imm = MI.getOperand(2).getImm(); if (!isInt<32>(-Imm)) return nullptr; @@ -1093,7 +1132,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; bool isKill; - unsigned SrcReg; + Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, SrcReg, isKill, ImplicitOp, LV)) @@ -1111,6 +1150,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::SUB64ri8: case X86::SUB64ri32: { + if (!MI.getOperand(2).isImm()) + return nullptr; int64_t Imm = MI.getOperand(2).getImm(); if (!isInt<32>(-Imm)) return nullptr; @@ -1140,40 +1181,62 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk: case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk: case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk: - case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk: { + case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk: + case X86::VBROADCASTSDZ256mk: + case X86::VBROADCASTSDZmk: + case X86::VBROADCASTSSZ128mk: + case X86::VBROADCASTSSZ256mk: + case X86::VBROADCASTSSZmk: + case X86::VPBROADCASTDZ128mk: + case X86::VPBROADCASTDZ256mk: + case X86::VPBROADCASTDZmk: + case X86::VPBROADCASTQZ128mk: + case X86::VPBROADCASTQZ256mk: + case X86::VPBROADCASTQZmk: { unsigned Opc; switch (MIOpc) { default: llvm_unreachable("Unreachable!"); - case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break; - case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break; - case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break; - case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break; - case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break; - case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break; - case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; - case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; - case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break; - case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; - case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; - case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break; - case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; - case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; - case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break; - case X86::VMOVUPSZ128rmk: Opc = 
X86::VBLENDMPSZ128rmk; break; - case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; - case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break; - case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; - case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; - case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break; - case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; - case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; - case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break; - case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; - case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; - case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break; - case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break; - case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; - case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break; + case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break; + case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break; + case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break; + case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break; + case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break; + case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break; + case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; + case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; + case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break; + case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; + case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; + case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break; + case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; + case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; + case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break; + case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break; + case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; + case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break; + case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; + case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; + case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break; + case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; + case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; + case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break; + case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; + case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; + case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break; + case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break; + case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; + case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break; + case X86::VBROADCASTSDZ256mk: Opc = X86::VBLENDMPDZ256rmbk; break; + case X86::VBROADCASTSDZmk: Opc = X86::VBLENDMPDZrmbk; break; + case X86::VBROADCASTSSZ128mk: Opc = X86::VBLENDMPSZ128rmbk; break; + case X86::VBROADCASTSSZ256mk: Opc = X86::VBLENDMPSZ256rmbk; break; + case X86::VBROADCASTSSZmk: Opc = X86::VBLENDMPSZrmbk; break; + case X86::VPBROADCASTDZ128mk: Opc = X86::VPBLENDMDZ128rmbk; break; + case X86::VPBROADCASTDZ256mk: Opc = X86::VPBLENDMDZ256rmbk; break; + case X86::VPBROADCASTDZmk: Opc = X86::VPBLENDMDZrmbk; break; + case X86::VPBROADCASTQZ128mk: Opc = X86::VPBLENDMQZ128rmbk; break; + case X86::VPBROADCASTQZ256mk: Opc = X86::VPBLENDMQZ256rmbk; break; + case X86::VPBROADCASTQZmk: Opc = X86::VPBLENDMQZrmbk; break; } NewMI = 
BuildMI(MF, MI.getDebugLoc(), get(Opc)) @@ -1187,6 +1250,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, .add(MI.getOperand(7)); break; } + case X86::VMOVDQU8Z128rrk: case X86::VMOVDQU8Z256rrk: case X86::VMOVDQU8Zrrk: @@ -1683,6 +1747,27 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } + case X86::VCMPSDZrr: + case X86::VCMPSSZrr: + case X86::VCMPPDZrri: + case X86::VCMPPSZrri: + case X86::VCMPPDZ128rri: + case X86::VCMPPSZ128rri: + case X86::VCMPPDZ256rri: + case X86::VCMPPSZ256rri: + case X86::VCMPPDZrrik: + case X86::VCMPPSZrrik: + case X86::VCMPPDZ128rrik: + case X86::VCMPPSZ128rrik: + case X86::VCMPPDZ256rrik: + case X86::VCMPPSZ256rrik: { + unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x1f; + Imm = X86::getSwappedVCMPImm(Imm); + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } case X86::VPERM2F128rr: case X86::VPERM2I128rr: { // Flip permute source immediate. @@ -1859,7 +1944,7 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI, // CommutableOpIdx2 is well defined now. Let's choose another commutable // operand and assign its index to CommutableOpIdx1. - unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg(); + Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg(); unsigned CommutableOpIdx1; for (CommutableOpIdx1 = LastCommutableVecOp; @@ -1889,7 +1974,8 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI, return true; } -bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, +bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI, + unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { const MCInstrDesc &Desc = MI.getDesc(); if (!Desc.isCommutable()) @@ -1926,17 +2012,23 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, // Ordered/Unordered/Equal/NotEqual tests unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7; switch (Imm) { + default: + // EVEX versions can be commuted. + if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX) + break; + return false; case 0x00: // EQUAL case 0x03: // UNORDERED case 0x04: // NOT EQUAL case 0x07: // ORDERED - // The indices of the commutable operands are 1 and 2 (or 2 and 3 - // when masked). - // Assign them to the returned operand indices here. - return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset, - 2 + OpOffset); + break; } - return false; + + // The indices of the commutable operands are 1 and 2 (or 2 and 3 + // when masked). + // Assign them to the returned operand indices here. + return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset, + 2 + OpOffset); } case X86::MOVSSrr: // X86::MOVSDrr is always commutable. 
MOVSS is only commutable if we can @@ -1990,6 +2082,24 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, case X86::VPTERNLOGQZ256rmbikz: case X86::VPTERNLOGQZrmbikz: return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + case X86::VPDPWSSDZ128r: + case X86::VPDPWSSDZ128rk: + case X86::VPDPWSSDZ128rkz: + case X86::VPDPWSSDZ256r: + case X86::VPDPWSSDZ256rk: + case X86::VPDPWSSDZ256rkz: + case X86::VPDPWSSDZr: + case X86::VPDPWSSDZrk: + case X86::VPDPWSSDZrkz: + case X86::VPDPWSSDSZ128r: + case X86::VPDPWSSDSZ128rk: + case X86::VPDPWSSDSZ128rkz: + case X86::VPDPWSSDSZ256r: + case X86::VPDPWSSDSZ256rk: + case X86::VPDPWSSDSZ256rkz: + case X86::VPDPWSSDSZr: + case X86::VPDPWSSDSZrk: + case X86::VPDPWSSDSZrkz: case X86::VPMADD52HUQZ128r: case X86::VPMADD52HUQZ128rk: case X86::VPMADD52HUQZ128rkz: @@ -2215,7 +2325,7 @@ unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) { } } -/// Get the VPCMP immediate if the opcodes are swapped. +/// Get the VPCMP immediate if the operands are swapped. unsigned X86::getSwappedVPCMPImm(unsigned Imm) { switch (Imm) { default: llvm_unreachable("Unreachable!"); @@ -2233,7 +2343,7 @@ unsigned X86::getSwappedVPCMPImm(unsigned Imm) { return Imm; } -/// Get the VPCOM immediate if the opcodes are swapped. +/// Get the VPCOM immediate if the operands are swapped. unsigned X86::getSwappedVPCOMImm(unsigned Imm) { switch (Imm) { default: llvm_unreachable("Unreachable!"); @@ -2251,6 +2361,23 @@ unsigned X86::getSwappedVPCOMImm(unsigned Imm) { return Imm; } +/// Get the VCMP immediate if the operands are swapped. +unsigned X86::getSwappedVCMPImm(unsigned Imm) { + // Only need the lower 2 bits to distinguish. + switch (Imm & 0x3) { + default: llvm_unreachable("Unreachable!"); + case 0x00: case 0x03: + // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted. + break; + case 0x01: case 0x02: + // Need to toggle bits 3:0. Bit 4 stays the same.
+ Imm ^= 0xf; + break; + } + + return Imm; +} + bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const { if (!MI.isTerminator()) return false; @@ -3131,25 +3258,6 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addReg(SrcReg, getKillRegState(isKill)); } -void X86InstrInfo::storeRegToAddr( - MachineFunction &MF, unsigned SrcReg, bool isKill, - SmallVectorImpl<MachineOperand> &Addr, const TargetRegisterClass *RC, - ArrayRef<MachineMemOperand *> MMOs, - SmallVectorImpl<MachineInstr *> &NewMIs) const { - const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); - bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; - unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); - DebugLoc DL; - MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); - for (unsigned i = 0, e = Addr.size(); i != e; ++i) - MIB.add(Addr[i]); - MIB.addReg(SrcReg, getKillRegState(isKill)); - MIB.setMemRefs(MMOs); - NewMIs.push_back(MIB); -} - - void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, int FrameIdx, @@ -3164,23 +3272,6 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx); } -void X86InstrInfo::loadRegFromAddr( - MachineFunction &MF, unsigned DestReg, - SmallVectorImpl<MachineOperand> &Addr, const TargetRegisterClass *RC, - ArrayRef<MachineMemOperand *> MMOs, - SmallVectorImpl<MachineInstr *> &NewMIs) const { - const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); - bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; - unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); - DebugLoc DL; - MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg); - for (unsigned i = 0, e = Addr.size(); i != e; ++i) - MIB.add(Addr[i]); - MIB.setMemRefs(MMOs); - NewMIs.push_back(MIB); -} - bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, unsigned &SrcReg2, int &CmpMask, int &CmpValue) const { @@ -3599,8 +3690,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, if (!IsCmpZero && !Sub) return false; - bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 && - Sub->getOperand(2).getReg() == SrcReg); + bool IsSwapped = + (SrcReg2 != 0 && Sub && Sub->getOperand(1).getReg() == SrcReg2 && + Sub->getOperand(2).getReg() == SrcReg); // Scan forward from the instruction after CmpInstr for uses of EFLAGS. // It is safe to remove CmpInstr if EFLAGS is redefined or killed. @@ -3755,7 +3847,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI, MachineOperand &MO = MI.getOperand(i); if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg != FoldAsLoadDefReg) continue; // Do not fold if we have a subreg use or a def. 
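The new X86::getSwappedVCMPImm defined above leans on the AVX/AVX-512 CMPPS/CMPPD predicate encoding: immediates congruent to 0 or 3 mod 4 (the EQ/UNORD/NEQ/ORD families, plus TRUE/FALSE) are symmetric under commutation, while the ordering predicates pair up as complements in their low four bits. A self-contained check of that property, assuming the standard predicate values (0x01 LT, 0x0E GT, 0x02 LE, 0x0D GE):

  #include <cassert>

  // Stand-alone mirror of X86::getSwappedVCMPImm, for illustration only.
  static unsigned swappedVCMPImm(unsigned Imm) {
    switch (Imm & 0x3) {
    case 0x00: case 0x03:           // EQ/UNORD/NEQ/ORD families: unchanged.
      break;
    case 0x01: case 0x02:           // LT<->GT, LE<->GE: flip bits 3:0.
      Imm ^= 0xf;
      break;
    }
    return Imm;
  }

  int main() {
    assert(swappedVCMPImm(0x01) == 0x0E); // vcmpltps a, b == vcmpgtps b, a
    assert(swappedVCMPImm(0x02) == 0x0D); // vcmpleps a, b == vcmpgeps b, a
    assert(swappedVCMPImm(0x00) == 0x00); // EQ is symmetric
    assert(swappedVCMPImm(0x11) == 0x1E); // bit 4 (signaling variants) is kept
  }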
@@ -3785,7 +3877,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI, static bool Expand2AddrUndef(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) { assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); - unsigned Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB->getOperand(0).getReg(); MIB->setDesc(Desc); // MachineInstr::addOperand() will insert explicit operands before any @@ -3815,7 +3907,7 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, bool MinusOne) { MachineBasicBlock &MBB = *MIB->getParent(); DebugLoc DL = MIB->getDebugLoc(); - unsigned Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB->getOperand(0).getReg(); // Insert the XOR. BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg) @@ -3891,7 +3983,7 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) { MachineBasicBlock &MBB = *MIB->getParent(); DebugLoc DL = MIB->getDebugLoc(); - unsigned Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB->getOperand(0).getReg(); const GlobalValue *GV = cast<GlobalValue>((*MIB->memoperands_begin())->getValue()); auto Flags = MachineMemOperand::MOLoad | @@ -3929,7 +4021,7 @@ static bool expandNOVLXLoad(MachineInstrBuilder &MIB, const MCInstrDesc &LoadDesc, const MCInstrDesc &BroadcastDesc, unsigned SubIdx) { - unsigned DestReg = MIB->getOperand(0).getReg(); + Register DestReg = MIB->getOperand(0).getReg(); // Check if DestReg is XMM16-31 or YMM16-31. if (TRI->getEncodingValue(DestReg) < 16) { // We can use a normal VEX encoded load. @@ -3952,7 +4044,7 @@ static bool expandNOVLXStore(MachineInstrBuilder &MIB, const MCInstrDesc &StoreDesc, const MCInstrDesc &ExtractDesc, unsigned SubIdx) { - unsigned SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg(); + Register SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg(); // Check if DestReg is XMM16-31 or YMM16-31. if (TRI->getEncodingValue(SrcReg) < 16) { // We can use a normal VEX encoded store. @@ -4008,12 +4100,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::V_SET0: case X86::FsFLD0SS: case X86::FsFLD0SD: + case X86::FsFLD0F128: return Expand2AddrUndef(MIB, get(HasAVX ? 
X86::VXORPSrr : X86::XORPSrr)); case X86::AVX_SET0: { assert(HasAVX && "AVX not supported"); const TargetRegisterInfo *TRI = &getRegisterInfo(); - unsigned SrcReg = MIB->getOperand(0).getReg(); - unsigned XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); + Register SrcReg = MIB->getOperand(0).getReg(); + Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); MIB->getOperand(0).setReg(XReg); Expand2AddrUndef(MIB, get(X86::VXORPSrr)); MIB.addReg(SrcReg, RegState::ImplicitDefine); @@ -4021,9 +4114,10 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } case X86::AVX512_128_SET0: case X86::AVX512_FsFLD0SS: - case X86::AVX512_FsFLD0SD: { + case X86::AVX512_FsFLD0SD: + case X86::AVX512_FsFLD0F128: { bool HasVLX = Subtarget.hasVLX(); - unsigned SrcReg = MIB->getOperand(0).getReg(); + Register SrcReg = MIB->getOperand(0).getReg(); const TargetRegisterInfo *TRI = &getRegisterInfo(); if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) return Expand2AddrUndef(MIB, @@ -4037,10 +4131,10 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: { bool HasVLX = Subtarget.hasVLX(); - unsigned SrcReg = MIB->getOperand(0).getReg(); + Register SrcReg = MIB->getOperand(0).getReg(); const TargetRegisterInfo *TRI = &getRegisterInfo(); if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) { - unsigned XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); + Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); MIB->getOperand(0).setReg(XReg); Expand2AddrUndef(MIB, get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr)); @@ -4060,14 +4154,14 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::AVX2_SETALLONES: return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr)); case X86::AVX1_SETALLONES: { - unsigned Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB->getOperand(0).getReg(); // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS. MIB->setDesc(get(X86::VCMPPSYrri)); MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf); return true; } case X86::AVX512_512_SETALLONES: { - unsigned Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB->getOperand(0).getReg(); MIB->setDesc(get(X86::VPTERNLOGDZrri)); // VPTERNLOGD needs 3 register inputs and an immediate. // 0xff will return 1s for any input. @@ -4077,8 +4171,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } case X86::AVX512_512_SEXT_MASK_32: case X86::AVX512_512_SEXT_MASK_64: { - unsigned Reg = MIB->getOperand(0).getReg(); - unsigned MaskReg = MIB->getOperand(1).getReg(); + Register Reg = MIB->getOperand(0).getReg(); + Register MaskReg = MIB->getOperand(1).getReg(); unsigned MaskState = getRegState(MIB->getOperand(1)); unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ? X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz; @@ -4115,8 +4209,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr), get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm); case X86::MOV32ri64: { - unsigned Reg = MIB->getOperand(0).getReg(); - unsigned Reg32 = RI.getSubReg(Reg, X86::sub_32bit); + Register Reg = MIB->getOperand(0).getReg(); + Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit); MI.setDesc(get(X86::MOV32ri)); MIB->getOperand(0).setReg(Reg32); MIB.addReg(Reg, RegState::ImplicitDefine); @@ -4251,8 +4345,8 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance( // If MI is marked as reading Reg, the partial register update is wanted. 
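getPartialRegUpdateClearance and breakPartialRegDependency below deal with instructions that write only part of a register and so inherit a false dependency on its previous writer; the classic shape, sketched in assembly comments for illustration:

  // cvtsi2ss %eax, %xmm0    ; writes xmm0[31:0] only: stalls on the last
  //                         ;   full writer of xmm0, whoever that was
  // xorps %xmm0, %xmm0      ; dependency-breaking idiom inserted by
  // cvtsi2ss %eax, %xmm0    ;   breakPartialRegDependency when the previous
  //                         ;   def is closer than the clearance threshold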
const MachineOperand &MO = MI.getOperand(0); - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + Register Reg = MO.getReg(); + if (Register::isVirtualRegister(Reg)) { if (MO.readsReg() || MI.readsVirtualRegister(Reg)) return 0; } else { @@ -4268,7 +4362,10 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance( // Return true for any instruction the copies the high bits of the first source // operand into the unused high bits of the destination operand. -static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) { +static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum, + bool ForLoadFold = false) { + // Set the OpNum parameter to the first source operand. + OpNum = 1; switch (Opcode) { case X86::VCVTSI2SSrr: case X86::VCVTSI2SSrm: @@ -4427,6 +4524,14 @@ static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) { case X86::VSQRTSDZm: case X86::VSQRTSDZm_Int: return true; + case X86::VMOVSSZrrk: + case X86::VMOVSDZrrk: + OpNum = 3; + return true; + case X86::VMOVSSZrrkz: + case X86::VMOVSDZrrkz: + OpNum = 2; + return true; } return false; @@ -4449,14 +4554,11 @@ static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) { unsigned X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum, const TargetRegisterInfo *TRI) const { - if (!hasUndefRegUpdate(MI.getOpcode())) + if (!hasUndefRegUpdate(MI.getOpcode(), OpNum)) return 0; - // Set the OpNum parameter to the first source operand. - OpNum = 1; - const MachineOperand &MO = MI.getOperand(OpNum); - if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + if (MO.isUndef() && Register::isPhysicalRegister(MO.getReg())) { return UndefRegClearance; } return 0; @@ -4464,7 +4566,7 @@ X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum, void X86InstrInfo::breakPartialRegDependency( MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const { - unsigned Reg = MI.getOperand(OpNum).getReg(); + Register Reg = MI.getOperand(OpNum).getReg(); // If MI kills this register, the false dependence is already broken. if (MI.killsRegister(Reg, TRI)) return; @@ -4480,7 +4582,7 @@ void X86InstrInfo::breakPartialRegDependency( } else if (X86::VR256RegClass.contains(Reg)) { // Use vxorps to clear the full ymm register. // It wants to read and write the xmm sub-register. - unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm); + Register XReg = TRI->getSubReg(Reg, X86::sub_xmm); BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg) .addReg(XReg, RegState::Undef) .addReg(XReg, RegState::Undef) @@ -4489,7 +4591,7 @@ void X86InstrInfo::breakPartialRegDependency( } else if (X86::GR64RegClass.contains(Reg)) { // Using XOR32rr because it has shorter encoding and zeros up the upper bits // as well. - unsigned XReg = TRI->getSubReg(Reg, X86::sub_32bit); + Register XReg = TRI->getSubReg(Reg, X86::sub_32bit); BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg) .addReg(XReg, RegState::Undef) .addReg(XReg, RegState::Undef) @@ -4538,8 +4640,8 @@ static void updateOperandRegConstraints(MachineFunction &MF, // We only need to update constraints on virtual register operands. 
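One behavioral note on the hasUndefRegUpdate rework above: the helper now reports which operand carries the undef read instead of callers hard-coding operand 1, so the masked-move forms can point past their mask operand. Roughly:

  unsigned OpNum;                                    // 1 for the cvt/sqrt forms,
  if (hasUndefRegUpdate(MI.getOpcode(), OpNum)) {    // 3 for VMOVSSZrrk/VMOVSDZrrk,
    const MachineOperand &MO = MI.getOperand(OpNum); // 2 for the rrkz variants
    // ... undef/physical-register check as in getUndefRegClearance above
  }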
if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!TRI.isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) continue; auto *NewRC = MRI.constrainRegClass( @@ -4698,7 +4800,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, MachineInstr &MI) { - if (!hasUndefRegUpdate(MI.getOpcode(), /*ForLoadFold*/true) || + unsigned Ignored; + if (!hasUndefRegUpdate(MI.getOpcode(), Ignored, /*ForLoadFold*/true) || !MI.getOperand(1).isReg()) return false; @@ -4788,6 +4891,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( if (I != nullptr) { unsigned Opcode = I->DstOp; unsigned MinAlign = (I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; + MinAlign = MinAlign ? 1 << (MinAlign - 1) : 0; if (Align < MinAlign) return nullptr; bool NarrowToMOV32rm = false; @@ -4821,8 +4925,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // If this is the special case where we use a MOV32rm to load a 32-bit // value and zero-extend the top bits. Change the destination register // to a 32-bit one. - unsigned DstReg = NewMI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + Register DstReg = NewMI->getOperand(0).getReg(); + if (Register::isPhysicalRegister(DstReg)) NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit)); else NewMI->getOperand(0).setSubReg(X86::sub_32bit); @@ -5133,6 +5237,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::V_SET0: case X86::V_SETALLONES: case X86::AVX512_128_SET0: + case X86::FsFLD0F128: + case X86::AVX512_FsFLD0F128: Alignment = 16; break; case X86::MMX_SET0: @@ -5182,7 +5288,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::FsFLD0SD: case X86::AVX512_FsFLD0SD: case X86::FsFLD0SS: - case X86::AVX512_FsFLD0SS: { + case X86::AVX512_FsFLD0SS: + case X86::FsFLD0F128: + case X86::AVX512_FsFLD0F128: { // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. // Create a constant-pool entry and operands to load from it. 
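The two fp128 zero pseudos join this constant-pool path. A rough sketch of the pool side, assuming MF names the current MachineFunction (LLVM-9-era unsigned-alignment API):

  MachineConstantPool &MCP = *MF.getConstantPool();
  Type *Ty = Type::getFP128Ty(MF.getFunction().getContext());
  const Constant *C = Constant::getNullValue(Ty);      // +0.0 as fp128
  unsigned CPI = MCP.getConstantPoolIndex(C, /*Alignment=*/16);
  // The folded instruction then loads the pool slot (rip-relative on
  // x86-64) instead of reading a register defined by FsFLD0F128.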
@@ -5212,6 +5320,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( Ty = Type::getFloatTy(MF.getFunction().getContext()); else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD) Ty = Type::getDoubleTy(MF.getFunction().getContext()); + else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128) + Ty = Type::getFP128Ty(MF.getFunction().getContext()); else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES) Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),16); else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 || @@ -5293,6 +5403,51 @@ extractStoreMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) { return StoreMMOs; } +static unsigned getBroadcastOpcode(const X86MemoryFoldTableEntry *I, + const TargetRegisterClass *RC, + const X86Subtarget &STI) { + assert(STI.hasAVX512() && "Expected at least AVX512!"); + unsigned SpillSize = STI.getRegisterInfo()->getSpillSize(*RC); + assert((SpillSize == 64 || STI.hasVLX()) && + "Can't broadcast less than 64 bytes without AVX512VL!"); + + switch (I->Flags & TB_BCAST_MASK) { + default: llvm_unreachable("Unexpected broadcast type!"); + case TB_BCAST_D: + switch (SpillSize) { + default: llvm_unreachable("Unknown spill size"); + case 16: return X86::VPBROADCASTDZ128m; + case 32: return X86::VPBROADCASTDZ256m; + case 64: return X86::VPBROADCASTDZm; + } + break; + case TB_BCAST_Q: + switch (SpillSize) { + default: llvm_unreachable("Unknown spill size"); + case 16: return X86::VPBROADCASTQZ128m; + case 32: return X86::VPBROADCASTQZ256m; + case 64: return X86::VPBROADCASTQZm; + } + break; + case TB_BCAST_SS: + switch (SpillSize) { + default: llvm_unreachable("Unknown spill size"); + case 16: return X86::VBROADCASTSSZ128m; + case 32: return X86::VBROADCASTSSZ256m; + case 64: return X86::VBROADCASTSSZm; + } + break; + case TB_BCAST_SD: + switch (SpillSize) { + default: llvm_unreachable("Unknown spill size"); + case 16: return X86::VMOVDDUPZ128rm; + case 32: return X86::VBROADCASTSDZ256m; + case 64: return X86::VBROADCASTSDZm; + } + break; + } +} + bool X86InstrInfo::unfoldMemoryOperand( MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const { @@ -5303,6 +5458,7 @@ bool X86InstrInfo::unfoldMemoryOperand( unsigned Index = I->Flags & TB_INDEX_MASK; bool FoldedLoad = I->Flags & TB_FOLDED_LOAD; bool FoldedStore = I->Flags & TB_FOLDED_STORE; + bool FoldedBCast = I->Flags & TB_FOLDED_BCAST; if (UnfoldLoad && !FoldedLoad) return false; UnfoldLoad &= FoldedLoad; @@ -5311,7 +5467,9 @@ bool X86InstrInfo::unfoldMemoryOperand( UnfoldStore &= FoldedStore; const MCInstrDesc &MCID = get(Opc); + const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); // TODO: Check if 32-byte or greater accesses are slow too? if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass && Subtarget.isUnalignedMem16Slow()) @@ -5335,10 +5493,26 @@ bool X86InstrInfo::unfoldMemoryOperand( AfterOps.push_back(Op); } - // Emit the load instruction. + // Emit the load or broadcast instruction. 
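For broadcast folds the helper getBroadcastOpcode above derives the element width from the TB_BCAST_* flag and the vector width from the register class spill size. The mapping is direct except for one asymmetry:

  // TB_BCAST_SS, 32-byte class -> VBROADCASTSSZ256m
  // TB_BCAST_Q,  64-byte class -> VPBROADCASTQZm
  // TB_BCAST_SD, 16-byte class -> VMOVDDUPZ128rm, since there is no
  //   128-bit vbroadcastsd from memory; the movddup has the same effect.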
if (UnfoldLoad) { auto MMOs = extractLoadMMOs(MI.memoperands(), MF); - loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs, NewMIs); + + unsigned Opc; + if (FoldedBCast) { + Opc = getBroadcastOpcode(I, RC, Subtarget); + } else { + unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); + bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; + Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget); + } + + DebugLoc DL; + MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg); + for (unsigned i = 0, e = AddrOps.size(); i != e; ++i) + MIB.add(AddrOps[i]); + MIB.setMemRefs(MMOs); + NewMIs.push_back(MIB); + if (UnfoldStore) { // Address operands cannot be marked isKill. for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) { @@ -5404,7 +5578,16 @@ bool X86InstrInfo::unfoldMemoryOperand( if (UnfoldStore) { const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF); auto MMOs = extractStoreMMOs(MI.memoperands(), MF); - storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs, NewMIs); + unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16); + bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; + unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget); + DebugLoc DL; + MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); + for (unsigned i = 0, e = AddrOps.size(); i != e; ++i) + MIB.add(AddrOps[i]); + MIB.addReg(Reg, RegState::Kill); + MIB.setMemRefs(MMOs); + NewMIs.push_back(MIB); } return true; @@ -5423,6 +5606,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, unsigned Index = I->Flags & TB_INDEX_MASK; bool FoldedLoad = I->Flags & TB_FOLDED_LOAD; bool FoldedStore = I->Flags & TB_FOLDED_STORE; + bool FoldedBCast = I->Flags & TB_FOLDED_BCAST; const MCInstrDesc &MCID = get(Opc); MachineFunction &MF = DAG.getMachineFunction(); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); @@ -5456,10 +5640,17 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, return false; // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte // memory access is slow above. - unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); - bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; - Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl, - VT, MVT::Other, AddrOps); + + unsigned Opc; + if (FoldedBCast) { + Opc = getBroadcastOpcode(I, RC, Subtarget); + } else { + unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); + bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; + Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget); + } + + Load = DAG.getMachineNode(Opc, dl, VT, MVT::Other, AddrOps); NewNodes.push_back(Load); // Preserve memory reference information. @@ -7367,6 +7558,96 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { } } +Optional<ParamLoadedValue> +X86InstrInfo::describeLoadedValue(const MachineInstr &MI) const { + const MachineOperand *Op = nullptr; + DIExpression *Expr = nullptr; + + switch (MI.getOpcode()) { + case X86::LEA32r: + case X86::LEA64r: + case X86::LEA64_32r: { + // Operand 4 could be global address. For now we do not support + // such situation. 
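When the scale and displacement are immediates, the LEA is described as a DWARF expression over its base register. For example, assuming the usual x86-64 DWARF numbering (rsi = 4):

  //   leaq 8(%rdi, %rsi, 4), %rax
  // is described as the operand %rdi plus the expression
  //   DW_OP_breg4 0, DW_OP_constu 4, DW_OP_mul, DW_OP_plus,
  //   DW_OP_plus_uconst 8
  // i.e. the value loaded into %rax is rdi + rsi * 4 + 8.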
+ if (!MI.getOperand(4).isImm() || !MI.getOperand(2).isImm()) + return None; + + const MachineOperand &Op1 = MI.getOperand(1); + const MachineOperand &Op2 = MI.getOperand(3); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + assert(Op2.isReg() && (Op2.getReg() == X86::NoRegister || + Register::isPhysicalRegister(Op2.getReg()))); + + // Omit situations like: + // %rsi = lea %rsi, 4, ... + if ((Op1.isReg() && Op1.getReg() == MI.getOperand(0).getReg()) || + Op2.getReg() == MI.getOperand(0).getReg()) + return None; + else if ((Op1.isReg() && Op1.getReg() != X86::NoRegister && + TRI->regsOverlap(Op1.getReg(), MI.getOperand(0).getReg())) || + (Op2.getReg() != X86::NoRegister && + TRI->regsOverlap(Op2.getReg(), MI.getOperand(0).getReg()))) + return None; + + int64_t Coef = MI.getOperand(2).getImm(); + int64_t Offset = MI.getOperand(4).getImm(); + SmallVector<uint64_t, 8> Ops; + + if ((Op1.isReg() && Op1.getReg() != X86::NoRegister)) { + Op = &Op1; + } else if (Op1.isFI()) + Op = &Op1; + + if (Op && Op->isReg() && Op->getReg() == Op2.getReg() && Coef > 0) { + Ops.push_back(dwarf::DW_OP_constu); + Ops.push_back(Coef + 1); + Ops.push_back(dwarf::DW_OP_mul); + } else { + if (Op && Op2.getReg() != X86::NoRegister) { + int dwarfReg = TRI->getDwarfRegNum(Op2.getReg(), false); + if (dwarfReg < 0) + return None; + else if (dwarfReg < 32) { + Ops.push_back(dwarf::DW_OP_breg0 + dwarfReg); + Ops.push_back(0); + } else { + Ops.push_back(dwarf::DW_OP_bregx); + Ops.push_back(dwarfReg); + Ops.push_back(0); + } + } else if (!Op) { + assert(Op2.getReg() != X86::NoRegister); + Op = &Op2; + } + + if (Coef > 1) { + assert(Op2.getReg() != X86::NoRegister); + Ops.push_back(dwarf::DW_OP_constu); + Ops.push_back(Coef); + Ops.push_back(dwarf::DW_OP_mul); + } + + if (((Op1.isReg() && Op1.getReg() != X86::NoRegister) || Op1.isFI()) && + Op2.getReg() != X86::NoRegister) { + Ops.push_back(dwarf::DW_OP_plus); + } + } + + DIExpression::appendOffset(Ops, Offset); + Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), Ops); + + return ParamLoadedValue(*Op, Expr); + } + case X86::XOR32rr: { + if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) + return ParamLoadedValue(MachineOperand::CreateImm(0), Expr); + return None; + } + default: + return TargetInstrInfo::describeLoadedValue(MI); + } +} + /// This is an architecture-specific helper function of reassociateOps. /// Set special operand attributes for new instructions after reassociation. void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1, @@ -7500,9 +7781,8 @@ namespace { // movq $_GLOBAL_OFFSET_TABLE_ - .LN$pb, %rcx // addq %rcx, %rax // RAX now holds address of _GLOBAL_OFFSET_TABLE_. - unsigned PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass); - unsigned GOTReg = - RegInfo.createVirtualRegister(&X86::GR64RegClass); + Register PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass); + Register GOTReg = RegInfo.createVirtualRegister(&X86::GR64RegClass); BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PBReg) .addReg(X86::RIP) .addImm(0) diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 13ca17139494..22b7b1d4cb19 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -67,6 +67,9 @@ unsigned getSwappedVPCMPImm(unsigned Imm); /// Get the VPCOM immediate if the opcodes are swapped. unsigned getSwappedVPCOMImm(unsigned Imm); +/// Get the VCMP immediate if the operands are swapped.
+unsigned getSwappedVCMPImm(unsigned Imm); + } // namespace X86 /// isGlobalStubReference - Return true if the specified TargetFlag operand is @@ -203,7 +206,7 @@ public: int &FrameIndex) const override; bool isReallyTriviallyReMaterializable(const MachineInstr &MI, - AliasAnalysis *AA) const override; + AAResults *AA) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, unsigned SubIdx, const MachineInstr &Orig, @@ -218,7 +221,7 @@ public: /// Reference parameters are set to indicate how caller should add this /// operand to the LEA instruction. bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, - unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc, + unsigned LEAOpcode, bool AllowSP, Register &NewSrc, bool &isKill, MachineOperand &ImplicitOp, LiveVariables *LV) const; @@ -251,7 +254,7 @@ public: /// findCommutedOpIndices(MI, Op1, Op2); /// can be interpreted as a query asking to find an operand that would be /// commutable with the operand#1. - bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, + bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; /// Returns an adjusted FMA opcode that must be used in FMA instruction that @@ -317,23 +320,11 @@ public: const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - ArrayRef<MachineMemOperand *> MMOs, - SmallVectorImpl<MachineInstr *> &NewMIs) const; - void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - void loadRegFromAddr(MachineFunction &MF, unsigned DestReg, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - ArrayRef<MachineMemOperand *> MMOs, - SmallVectorImpl<MachineInstr *> &NewMIs) const; - bool expandPostRAPseudo(MachineInstr &MI) const override; /// Check whether the target can fold a load that feeds a subreg operand @@ -527,6 +518,13 @@ public: #define GET_INSTRINFO_HELPER_DECLS #include "X86GenInstrInfo.inc" + static bool hasLockPrefix(const MachineInstr &MI) { + return MI.getDesc().TSFlags & X86II::LOCK; + } + + Optional<ParamLoadedValue> + describeLoadedValue(const MachineInstr &MI) const override; + protected: /// Commutes the operands in the given instruction by changing the operands /// order and/or changing the instruction's opcode and/or the immediate value diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 8e05dd8ec5c1..e452145f3b65 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -673,6 +673,14 @@ def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass { ImmSExti64i32AsmOperand]; } +// 4-bit immediate used by some XOP instructions +// [0, 0xF] +def ImmUnsignedi4AsmOperand : AsmOperandClass { + let Name = "ImmUnsignedi4"; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "InvalidImmUnsignedi4"; +} + // Unsigned immediate used by SSE/AVX instructions // [0, 0xFF] // [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] @@ -705,6 +713,13 @@ def i64i8imm : Operand<i64> { let OperandType = "OPERAND_IMMEDIATE"; } +// Unsigned 4-bit immediate used by some XOP instructions. 
+def u4imm : Operand<i8> { + let PrintMethod = "printU8Imm"; + let ParserMatchClass = ImmUnsignedi4AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + // Unsigned 8-bit immediate used by SSE/AVX instructions. def u8imm : Operand<i8> { let PrintMethod = "printU8Imm"; @@ -925,7 +940,6 @@ def HasMOVDIR64B : Predicate<"Subtarget->hasMOVDIR64B()">; def HasPTWRITE : Predicate<"Subtarget->hasPTWRITE()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; -def HasMPX : Predicate<"Subtarget->hasMPX()">; def HasSHSTK : Predicate<"Subtarget->hasSHSTK()">; def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">; def HasCLWB : Predicate<"Subtarget->hasCLWB()">; @@ -1103,7 +1117,7 @@ def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{ if (ExtType == ISD::NON_EXTLOAD) return true; if (ExtType == ISD::EXTLOAD) - return LD->getAlignment() >= 2 && !LD->isVolatile(); + return LD->getAlignment() >= 2 && LD->isSimple(); return false; }]>; @@ -1113,7 +1127,7 @@ def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{ if (ExtType == ISD::NON_EXTLOAD) return true; if (ExtType == ISD::EXTLOAD) - return LD->getAlignment() >= 4 && !LD->isVolatile(); + return LD->getAlignment() >= 4 && LD->isSimple(); return false; }]>; @@ -1170,7 +1184,7 @@ def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (unindexedload node:$ptr)), [ if (LD->getMemoryVT() == MVT::i32) return true; - return LD->getAlignment() >= 4 && !LD->isVolatile(); + return LD->getAlignment() >= 4 && LD->isSimple(); }]>; @@ -2404,25 +2418,26 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in { } multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM, - RegisterClass RC, X86MemOperand x86memop> { + RegisterClass RC, X86MemOperand x86memop, + X86FoldableSchedWrite sched> { let hasSideEffects = 0 in { def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, - T8PS, VEX_4V, Sched<[WriteBLS]>; + T8PS, VEX_4V, Sched<[sched]>; let mayLoad = 1 in def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, - T8PS, VEX_4V, Sched<[WriteBLS.Folded]>; + T8PS, VEX_4V, Sched<[sched.Folded]>; } } let Predicates = [HasBMI], Defs = [EFLAGS] in { - defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem>; - defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem>, VEX_W; - defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem>; - defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem>, VEX_W; - defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem>; - defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem>, VEX_W; + defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, WriteBLS>; + defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, WriteBLS>, VEX_W; + defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, WriteBLS>; + defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, WriteBLS>, VEX_W; + defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, WriteBLS>; + defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, WriteBLS>, VEX_W; } //===----------------------------------------------------------------------===// @@ -2683,12 +2698,12 @@ def SLWPCB64 : I<0x12, MRM1r, (outs GR64:$dst), (ins), "slwpcb\t$dst", multiclass lwpins_intr<RegisterClass RC> { def rri : Ii32<0x12, MRM0r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl), "lwpins\t{$cntl, $src1, $src0|$src0, $src1, 
$cntl}", - [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, imm:$cntl))]>, + [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, timm:$cntl))]>, XOP_4V, XOPA; let mayLoad = 1 in def rmi : Ii32<0x12, MRM0m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl), "lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", - [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), imm:$cntl))]>, + [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), timm:$cntl))]>, XOP_4V, XOPA; } @@ -2700,11 +2715,11 @@ let Defs = [EFLAGS] in { multiclass lwpval_intr<RegisterClass RC, Intrinsic Int> { def rri : Ii32<0x12, MRM1r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl), "lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", - [(Int RC:$src0, GR32:$src1, imm:$cntl)]>, XOP_4V, XOPA; + [(Int RC:$src0, GR32:$src1, timm:$cntl)]>, XOP_4V, XOPA; let mayLoad = 1 in def rmi : Ii32<0x12, MRM1m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl), "lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", - [(Int RC:$src0, (loadi32 addr:$src1), imm:$cntl)]>, + [(Int RC:$src0, (loadi32 addr:$src1), timm:$cntl)]>, XOP_4V, XOPA; } @@ -3205,13 +3220,13 @@ def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>; // Disambiguate the mem/imm form of bt-without-a-suffix as btl. // Likewise for btc/btr/bts. def : InstAlias<"bt\t{$imm, $mem|$mem, $imm}", - (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; + (BT32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">; def : InstAlias<"btc\t{$imm, $mem|$mem, $imm}", - (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; + (BTC32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">; def : InstAlias<"btr\t{$imm, $mem|$mem, $imm}", - (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; + (BTR32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">; def : InstAlias<"bts\t{$imm, $mem|$mem, $imm}", - (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; + (BTS32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">; // clr aliases. def : InstAlias<"clr{b}\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>; diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index 57835b1a256a..cd9a866c91cb 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -30,7 +30,6 @@ def MMX_SET0 : I<0, Pseudo, (outs VR64:$dst), (ins), "", []>; let Constraints = "$src1 = $dst" in { // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic. - // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp. 
multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, X86FoldableSchedWrite sched, bit Commutable = 0, X86MemOperand OType = i64mem> { @@ -67,7 +66,7 @@ let Constraints = "$src1 = $dst" in { def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst), (ins VR64:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId2 VR64:$src1, imm:$src2))]>, + [(set VR64:$dst, (IntId2 VR64:$src1, timm:$src2))]>, Sched<[schedImm]>; } } @@ -114,13 +113,13 @@ multiclass ssse3_palign_mm<string asm, Intrinsic IntId, def rri : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>, + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 timm:$src3)))]>, Sched<[sched]>; def rmi : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR64:$dst, (IntId VR64:$src1, - (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>, + (bitconvert (load_mmx addr:$src2)), (i8 timm:$src3)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -496,14 +495,14 @@ def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, u8imm:$src2), "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR64:$dst, - (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))]>, + (int_x86_sse_pshuf_w VR64:$src1, timm:$src2))]>, Sched<[SchedWriteShuffle.MMX]>; def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src1, u8imm:$src2), "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR64:$dst, (int_x86_sse_pshuf_w (load_mmx addr:$src1), - imm:$src2))]>, + timm:$src2))]>, Sched<[SchedWriteShuffle.MMX.Folded]>; // -- Conversion Instructions @@ -535,7 +534,7 @@ def MMX_PEXTRWrr: MMXIi8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1, - imm:$src2))]>, + timm:$src2))]>, Sched<[WriteVecExtract]>; let Constraints = "$src1 = $dst" in { let Predicates = [HasMMX, HasSSE1] in { @@ -544,7 +543,7 @@ let Predicates = [HasMMX, HasSSE1] in { (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3), "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, - GR32orGR64:$src2, imm:$src3))]>, + GR32orGR64:$src2, timm:$src3))]>, Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem, @@ -553,7 +552,7 @@ let Predicates = [HasMMX, HasSSE1] in { "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, (i32 (anyext (loadi16 addr:$src2))), - imm:$src3))]>, + timm:$src3))]>, Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; } } @@ -567,6 +566,13 @@ def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (int_x86_mmx_pmovmskb VR64:$src))]>, Sched<[WriteMMXMOVMSK]>; +// MMX to XMM for vector types +def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1, + [SDTCisVT<0, v2i64>, SDTCisVT<1, x86mmx>]>>; + +def : Pat<(v2i64 (MMX_X86movq2dq VR64:$src)), + (v2i64 (MMX_MOVQ2DQrr VR64:$src))>; + // Low word of XMM to MMX. 
def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1, [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>; @@ -574,9 +580,13 @@ def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1, def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)), (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>; -def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))), +def : Pat<(x86mmx (MMX_X86movdq2q (v2i64 (simple_load addr:$src)))), (x86mmx (MMX_MOVQ64rm addr:$src))>; +def : Pat<(v2i64 (X86vzmovl (scalar_to_vector + (i64 (bitconvert (x86mmx VR64:$src)))))), + (MMX_MOVQ2DQrr VR64:$src)>; + // Misc. let SchedRW = [SchedWriteShuffle.MMX] in { let Uses = [EDI], Predicates = [HasMMX, HasSSE1,Not64BitMode] in @@ -602,9 +612,6 @@ def : Pat<(x86mmx (MMX_X86movdq2q (bc_v2i64 (v4i32 (X86cvttp2si (v4f32 VR128:$src)))))), (MMX_CVTTPS2PIirr VR128:$src)>; def : Pat<(x86mmx (MMX_X86movdq2q - (bc_v2i64 (v4i32 (fp_to_sint (v4f32 VR128:$src)))))), - (MMX_CVTTPS2PIirr VR128:$src)>; -def : Pat<(x86mmx (MMX_X86movdq2q (bc_v2i64 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), (MMX_CVTPD2PIirr VR128:$src)>; def : Pat<(x86mmx (MMX_X86movdq2q diff --git a/lib/Target/X86/X86InstrMPX.td b/lib/Target/X86/X86InstrMPX.td index f7d931510fe2..44ba071947c2 100644 --- a/lib/Target/X86/X86InstrMPX.td +++ b/lib/Target/X86/X86InstrMPX.td @@ -12,16 +12,16 @@ // //===----------------------------------------------------------------------===// -// FIXME: Investigate a better scheduler class once MPX is used inside LLVM. +// FIXME: Investigate a better scheduler class if MPX is ever used inside LLVM. let SchedRW = [WriteSystem] in { multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> { def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src), OpcodeStr#"\t{$src, $dst|$dst, $src}", []>, - Requires<[HasMPX, Not64BitMode]>; + Requires<[Not64BitMode]>; def 64rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src), OpcodeStr#"\t{$src, $dst|$dst, $src}", []>, - Requires<[HasMPX, In64BitMode]>; + Requires<[In64BitMode]>; } defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS; @@ -29,17 +29,17 @@ defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS; multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> { def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, anymem:$src2), OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, - Requires<[HasMPX, Not64BitMode]>; + Requires<[Not64BitMode]>; def 64rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, anymem:$src2), OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, - Requires<[HasMPX, In64BitMode]>; + Requires<[In64BitMode]>; def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2), OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, - Requires<[HasMPX, Not64BitMode]>; + Requires<[Not64BitMode]>; def 64rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2), OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, - Requires<[HasMPX, In64BitMode]>; + Requires<[In64BitMode]>; } defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS, NotMemoryFoldable; defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD, NotMemoryFoldable; @@ -47,33 +47,31 @@ defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD, NotMemoryFoldable; def BNDMOVrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, - Requires<[HasMPX]>, NotMemoryFoldable; + NotMemoryFoldable; let mayLoad = 1 in { def BNDMOV32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, - Requires<[HasMPX, Not64BitMode]>, NotMemoryFoldable; + Requires<[Not64BitMode]>, NotMemoryFoldable; def BNDMOV64rm : I<0x1A, 
MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, - Requires<[HasMPX, In64BitMode]>, NotMemoryFoldable; + Requires<[In64BitMode]>, NotMemoryFoldable; } let isCodeGenOnly = 1, ForceDisassemble = 1 in def BNDMOVrr_REV : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, - Requires<[HasMPX]>, NotMemoryFoldable; + NotMemoryFoldable; let mayStore = 1 in { def BNDMOV32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, - Requires<[HasMPX, Not64BitMode]>, NotMemoryFoldable; + Requires<[Not64BitMode]>, NotMemoryFoldable; def BNDMOV64mr : I<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, - Requires<[HasMPX, In64BitMode]>, NotMemoryFoldable; + Requires<[In64BitMode]>, NotMemoryFoldable; def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins anymem:$dst, BNDR:$src), - "bndstx\t{$src, $dst|$dst, $src}", []>, PS, - Requires<[HasMPX]>; + "bndstx\t{$src, $dst|$dst, $src}", []>, PS; } let mayLoad = 1 in def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src), - "bndldx\t{$src, $dst|$dst, $src}", []>, PS, - Requires<[HasMPX]>; + "bndldx\t{$src, $dst|$dst, $src}", []>, PS; } // SchedRW diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 7d0a5b87baf4..09a04c0338b4 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -115,7 +115,9 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "", [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>; def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "", - [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>; + [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>; + def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>; } //===----------------------------------------------------------------------===// @@ -128,13 +130,18 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-zeros value if folding it would be beneficial. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isPseudo = 1, SchedRW = [WriteZero] in { + isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in { def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4f32 immAllZerosV))]>; } -let Predicates = [NoAVX512] in +let Predicates = [NoAVX512] in { +def : Pat<(v16i8 immAllZerosV), (V_SET0)>; +def : Pat<(v8i16 immAllZerosV), (V_SET0)>; def : Pat<(v4i32 immAllZerosV), (V_SET0)>; +def : Pat<(v2i64 immAllZerosV), (V_SET0)>; +def : Pat<(v2f64 immAllZerosV), (V_SET0)>; +} // The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI, @@ -147,6 +154,14 @@ def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "", [(set VR256:$dst, (v8i32 immAllZerosV))]>; } +let Predicates = [NoAVX512] in { +def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>; +def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>; +def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>; +def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>; +def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>; +} + // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-ones value if folding it would be beneficial. 
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
@@ -355,7 +370,7 @@ defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd
 defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
                                  SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                  PS, VEX, VEX_L, VEX_WIG;
-defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", 
+defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
                                  SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                  PD, VEX, VEX_L, VEX_WIG;
 }
@@ -661,7 +676,7 @@ let Predicates = [UseSSE1] in {
   // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
   // end up with a movsd or blend instead of shufp.
   // No need for aligned load, we're only loading 64-bits.
-  def : Pat<(X86Shufp (v4f32 (nonvolatile_load addr:$src2)), VR128:$src1,
+  def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
                       (i8 -28)),
             (MOVLPSrm VR128:$src1, addr:$src2)>;
   def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
@@ -727,7 +742,7 @@ let Predicates = [UseSSE1] in {
   // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
   // end up with a movsd or blend instead of shufp.
   // No need for aligned load, we're only loading 64-bits.
-  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (nonvolatile_load addr:$src2))),
+  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))),
             (MOVHPSrm VR128:$src1, addr:$src2)>;
   def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
             (MOVHPSrm VR128:$src1, addr:$src2)>;
@@ -761,7 +776,7 @@ let Predicates = [UseSSE2] in {
 let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
   // Use MOVLPD to load into the low bits from a full vector unless we can use
   // BLENDPD.
-  def : Pat<(X86Movsd VR128:$src1, (v2f64 (nonvolatile_load addr:$src2))),
+  def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
             (MOVLPDrm VR128:$src1, addr:$src2)>;
 }
 
@@ -1713,12 +1728,12 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
   let isCommutable = 1 in
   def rr : SIi8<0xC2, MRMSrcReg,
                 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
-                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>,
+                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, timm:$cc))]>,
                 Sched<[sched]>;
   def rm : SIi8<0xC2, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
                 [(set RC:$dst, (OpNode (VT RC:$src1),
-                                       (ld_frag addr:$src2), imm:$cc))]>,
+                                       (ld_frag addr:$src2), timm:$cc))]>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -1751,13 +1766,13 @@ multiclass sse12_cmp_scalar_int<Operand memop,
   def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src1, VR128:$src, u8imm:$cc), asm,
                     [(set VR128:$dst, (Int VR128:$src1,
-                                           VR128:$src, imm:$cc))]>,
+                                           VR128:$src, timm:$cc))]>,
                     Sched<[sched]>;
   let mayLoad = 1 in
   def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                     (ins VR128:$src1, memop:$src, u8imm:$cc), asm,
                     [(set VR128:$dst, (Int VR128:$src1,
-                                           mem_cpat:$src, imm:$cc))]>,
+                                           mem_cpat:$src, timm:$cc))]>,
                     Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
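The change that repeats throughout this file is `imm` becoming `timm` in the selection patterns. My reading (an inference, not stated anywhere in the diff itself): `timm` matches a TargetConstant node, an operand that must stay a literal baked into the instruction encoding, which is exactly the situation for these compare/round/shuffle control bytes. The same constraint is visible at the source level, where the predicate of `vcmpss` has to be a compile-time constant. A minimal sketch (requires AVX; the function name is mine):

```cpp
#include <immintrin.h>

// cmpss/vcmpss encode the comparison predicate in an immediate byte; it can
// never come from a register, which is what timm (TargetConstant) expresses.
__m128 cmp_eq_low(__m128 a, __m128 b) {
  return _mm_cmp_ss(a, b, _CMP_EQ_OQ); // predicate baked into the encoding
}
```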
@@ -1876,12 +1891,12 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
   let isCommutable = 1 in
   def rri : PIi8<0xC2, MRMSrcReg,
              (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
-             [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>,
+             [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
             Sched<[sched]>;
   def rmi : PIi8<0xC2, MRMSrcMem,
              (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
              [(set RC:$dst,
-               (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>,
+               (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -1906,7 +1921,7 @@ let Constraints = "$src1 = $dst" in {
                                    SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble,
                                    memopv2f64>, PD;
 }
 
-def CommutableCMPCC : PatLeaf<(imm), [{
+def CommutableCMPCC : PatLeaf<(timm), [{
   uint64_t Imm = N->getZExtValue() & 0x7;
   return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
 }]>;
@@ -1915,47 +1930,47 @@ def CommutableCMPCC : PatLeaf<(imm), [{
 let Predicates = [HasAVX] in {
   def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1,
                             CommutableCMPCC:$cc)),
-            (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
+            (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;
 
   def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1,
                             CommutableCMPCC:$cc)),
-            (VCMPPSYrmi VR256:$src1, addr:$src2, imm:$cc)>;
+            (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;
 
   def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1,
                             CommutableCMPCC:$cc)),
-            (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+            (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
 
   def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1,
                             CommutableCMPCC:$cc)),
-            (VCMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;
+            (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
 
   def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
                           CommutableCMPCC:$cc)),
-            (VCMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;
+            (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
 
   def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
                           CommutableCMPCC:$cc)),
-            (VCMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
+            (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
 }
 
 let Predicates = [UseSSE2] in {
   def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1,
                             CommutableCMPCC:$cc)),
-            (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+            (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
 
   def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
                           CommutableCMPCC:$cc)),
-            (CMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;
+            (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
 }
 
 let Predicates = [UseSSE1] in {
   def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1,
                             CommutableCMPCC:$cc)),
-            (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;
+            (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
 
   def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
                           CommutableCMPCC:$cc)),
-            (CMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
+            (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
 }
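`CommutableCMPCC` accepts predicates 0x00, 0x03, 0x04 and 0x07, which decode to EQ, UNORD, NEQ and ORD. As far as I can tell these are precisely the symmetric comparison relations, which is why the patterns above may swap the operands to fold a load from the first source. A standalone check of that property (my own sketch, not LLVM code):

```cpp
#include <cassert>
#include <cmath>

// The four predicates CommutableCMPCC accepts are the symmetric ones; LT/LE
// and their negations are excluded because swapping operands changes them.
static bool cmp(double a, double b, unsigned cc) {
  switch (cc & 0x7) {
  case 0x0: return a == b;                           // EQ
  case 0x3: return std::isnan(a) || std::isnan(b);   // UNORD
  case 0x4: return a != b;                           // NEQ (true on NaN)
  case 0x7: return !std::isnan(a) && !std::isnan(b); // ORD
  default:  return false; // non-symmetric predicates not modeled here
  }
}

int main() {
  const double vals[] = {1.0, 2.0, NAN};
  for (unsigned cc : {0x0u, 0x3u, 0x4u, 0x7u}) // the CommutableCMPCC set
    for (double a : vals)
      for (double b : vals)
        assert(cmp(a, b, cc) == cmp(b, a, cc)); // operand order never matters
}
```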
//===----------------------------------------------------------------------===//
@@ -1970,13 +1985,13 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
   def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
                  [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
-                                     (i8 imm:$src3))))], d>,
+                                     (i8 timm:$src3))))], d>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
   let isCommutable = IsCommutable in
   def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
                  [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
-                                     (i8 imm:$src3))))], d>,
+                                     (i8 timm:$src3))))], d>,
                  Sched<[sched]>;
 }
 
@@ -2097,7 +2112,7 @@ let Predicates = [HasAVX1Only] in {
 let Predicates = [UseSSE2] in {
   // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
   def : Pat<(v2f64 (X86Unpckl VR128:$src1,
-                              (v2f64 (nonvolatile_load addr:$src2)))),
+                              (v2f64 (simple_load addr:$src2)))),
             (MOVHPDrm VR128:$src1, addr:$src2)>;
 }
 
@@ -2721,7 +2736,7 @@ defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64,
 defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
 defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
 defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
- 
+
 /// Unop Arithmetic
 /// In addition, we also have a special variant of the scalar form here to
 /// represent the associated intrinsic operation. This form is unlike the
@@ -3482,7 +3497,7 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))]>,
+       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>,
        Sched<[schedImm]>;
 }
 
@@ -3514,7 +3529,7 @@ multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))]>,
+       [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>,
        Sched<[sched]>;
 }
 
@@ -3597,7 +3612,7 @@ let Predicates = [HasAVX, prd] in {
                        !strconcat("v", OpcodeStr,
                                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                        [(set VR128:$dst,
-                        (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
+                        (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
                        VEX, Sched<[sched.XMM]>, VEX_WIG;
   def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
                       (ins i128mem:$src1, u8imm:$src2),
@@ -3605,7 +3620,7 @@ let Predicates = [HasAVX, prd] in {
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR128:$dst,
                        (vt128 (OpNode (load addr:$src1),
-                        (i8 imm:$src2))))]>, VEX,
+                        (i8 timm:$src2))))]>, VEX,
                       Sched<[sched.XMM.Folded]>, VEX_WIG;
 }
 
@@ -3615,7 +3630,7 @@ let Predicates = [HasAVX2, prd] in {
                         !strconcat("v", OpcodeStr,
                                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                         [(set VR256:$dst,
-                         (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))]>,
+                         (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>,
                         VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
   def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
                        (ins i256mem:$src1, u8imm:$src2),
@@ -3623,7 +3638,7 @@ let Predicates = [HasAVX2, prd] in {
                                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                        [(set VR256:$dst,
                         (vt256 (OpNode (load addr:$src1),
-                         (i8 imm:$src2))))]>, VEX, VEX_L,
+                         (i8 timm:$src2))))]>, VEX, VEX_L,
                        Sched<[sched.YMM.Folded]>, VEX_WIG;
 }
 
@@ -3633,7 +3648,7 @@ let Predicates = [UseSSE2] in {
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set VR128:$dst,
-                 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
+                 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
                 Sched<[sched.XMM]>;
   def mi : Ii8<0x70, MRMSrcMem,
                (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
@@ -3641,7 +3656,7 @@ let Predicates = [UseSSE2] in {
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR128:$dst,
                 (vt128 (OpNode (memop addr:$src1),
-                       (i8 imm:$src2))))]>,
+                       (i8 timm:$src2))))]>,
                Sched<[sched.XMM.Folded]>;
 }
 }
 
@@ -4380,7 +4395,7 @@ defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
 
 let Predicates = [HasAVX, NoVLX] in {
-  def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
+  def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))),
             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
   def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
@@ -4388,7 +4403,7 @@ let Predicates = [HasAVX, NoVLX] in {
 
 let Predicates = [UseSSE3] in {
   // No need for aligned memory as this only loads 64-bits.
-  def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
+  def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))),
             (MOVDDUPrm addr:$src)>;
   def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
             (MOVDDUPrm addr:$src)>;
@@ -4812,7 +4827,7 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
       !strconcat(asm,
                  "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
       !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
-      [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))]>,
+      [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
      Sched<[sched]>;
   let mayLoad = 1 in
   def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
@@ -4823,7 +4838,7 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst, (VT (X86PAlignr RC:$src1,
                                     (memop_frag addr:$src2),
-                                    (i8 imm:$src3))))]>,
+                                    (i8 timm:$src3))))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 }
 
@@ -5300,7 +5315,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
           !strconcat(asm,
                      "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set VR128:$dst,
-        (X86insertps VR128:$src1, VR128:$src2, imm:$src3))]>,
+        (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
       Sched<[SchedWriteFShuffle.XMM]>;
   def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
@@ -5311,7 +5326,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
       [(set VR128:$dst,
         (X86insertps VR128:$src1,
                      (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
-                     imm:$src3))]>,
+                     timm:$src3))]>,
       Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
 }
 
@@ -5323,17 +5338,6 @@ let ExeDomain = SSEPackedSingle in {
   defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
 }
 
-let Predicates = [UseAVX] in {
-  // If we're inserting an element from a vbroadcast of a load, fold the
-  // load into the X86insertps instruction.
-  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
-                  (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
-            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
-  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
-                  (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
-            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
-}
-
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Round Instructions
 //===----------------------------------------------------------------------===//
@@ -5348,7 +5352,7 @@ multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
                     (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                    [(set RC:$dst, (VT (OpNode RC:$src1, imm:$src2)))]>,
+                    [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
                    Sched<[sched]>;
 
   // Vector intrinsic operation, mem
@@ -5357,13 +5361,13 @@ multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set RC:$dst,
-                          (VT (OpNode (mem_frag addr:$src1),imm:$src2)))]>,
+                          (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
                    Sched<[sched.Folded]>;
 }
 
 multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
                           string OpcodeStr, X86FoldableSchedWrite sched> {
-let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
+let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
   def SSr : SS4AIi8<opcss, MRMSrcReg,
         (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
         !strconcat(OpcodeStr,
@@ -5378,7 +5382,7 @@ let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
 } // ExeDomain = SSEPackedSingle, hasSideEffects = 0
 
-let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
+let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
   def SDr : SS4AIi8<opcsd, MRMSrcReg,
         (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
         !strconcat(OpcodeStr,
@@ -5396,7 +5400,7 @@ let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
 
 multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr, X86FoldableSchedWrite sched> {
-let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
+let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
   def SSr : SS4AIi8<opcss, MRMSrcReg, (outs FR32:$dst),
                     (ins FR32:$src1, i32u8imm:$src2),
                     !strconcat(OpcodeStr,
@@ -5411,7 +5415,7 @@ let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
 } // ExeDomain = SSEPackedSingle, hasSideEffects = 0
 
-let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
+let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
   def SDr : SS4AIi8<opcsd, MRMSrcReg, (outs FR64:$dst),
                     (ins FR64:$src1, i32u8imm:$src2),
                     !strconcat(OpcodeStr,
@@ -5431,7 +5435,7 @@ multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
                             string OpcodeStr, X86FoldableSchedWrite sched,
                             ValueType VT32, ValueType VT64,
                             SDNode OpNode, bit Is2Addr = 1> {
-let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
+let ExeDomain = SSEPackedSingle in {
   def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
         (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
         !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
-        [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
+        [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
        Sched<[sched]>;
 
   def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
@@ -5450,11 +5454,11 @@ let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
-            (OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
+            (OpNode VR128:$src1, sse_load_f32:$src2, timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 } // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1
 
-let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
+let ExeDomain = SSEPackedDouble in {
   def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
@@ -5462,7 +5466,7 @@ let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
            "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
           !strconcat(OpcodeStr,
               "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
-       [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
+       [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
       Sched<[sched]>;
 
   def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
@@ -5473,7 +5477,7 @@ let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
          !strconcat(OpcodeStr,
              "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set VR128:$dst,
-            (OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
+            (OpNode VR128:$src1, sse_load_f64:$src2, timm:$src3))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
 } // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
 }
 
@@ -5508,17 +5512,17 @@ let Predicates = [UseAVX] in {
 }
 
 let Predicates = [UseAVX] in {
-  def : Pat<(X86VRndScale FR32:$src1, imm:$src2),
-            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, imm:$src2)>;
-  def : Pat<(X86VRndScale FR64:$src1, imm:$src2),
-            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, imm:$src2)>;
+  def : Pat<(X86VRndScale FR32:$src1, timm:$src2),
+            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
+  def : Pat<(X86VRndScale FR64:$src1, timm:$src2),
+            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
 }
 
 let Predicates = [UseAVX, OptForSize] in {
-  def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2),
-            (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>;
-  def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2),
-            (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>;
+  def : Pat<(X86VRndScale (loadf32 addr:$src1), timm:$src2),
+            (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
+  def : Pat<(X86VRndScale (loadf64 addr:$src1), timm:$src2),
+            (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
 }
 
 let ExeDomain = SSEPackedSingle in
@@ -5535,17 +5539,17 @@ defm ROUND  : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
                                v4f32, v2f64, X86RndScales>;
 
 let Predicates = [UseSSE41] in {
-  def : Pat<(X86VRndScale FR32:$src1, imm:$src2),
-            (ROUNDSSr FR32:$src1, imm:$src2)>;
-  def : Pat<(X86VRndScale FR64:$src1, imm:$src2),
-            (ROUNDSDr FR64:$src1, imm:$src2)>;
+  def : Pat<(X86VRndScale FR32:$src1, timm:$src2),
+            (ROUNDSSr FR32:$src1, timm:$src2)>;
+  def : Pat<(X86VRndScale FR64:$src1, timm:$src2),
+            (ROUNDSDr FR64:$src1, timm:$src2)>;
 }
 
 let Predicates = [UseSSE41, OptForSize] in {
-  def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2),
-            (ROUNDSSm addr:$src1, imm:$src2)>;
-  def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2),
-            (ROUNDSDm addr:$src1, imm:$src2)>;
+  def : Pat<(X86VRndScale (loadf32 addr:$src1), timm:$src2),
+            (ROUNDSSm addr:$src1, timm:$src2)>;
+  def : Pat<(X86VRndScale (loadf64 addr:$src1), timm:$src2),
+            (ROUNDSDm addr:$src1, timm:$src2)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -5826,7 +5830,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
       !if(Is2Addr,
          !strconcat(OpcodeStr,
                     "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(OpcodeStr,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
-      [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
+      [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
      Sched<[sched]>;
   def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
      (ins RC:$src1, x86memop:$src2, u8imm:$src3),
@@ -5836,7 +5840,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
         !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst,
-       (IntId RC:$src1, (memop_frag addr:$src2), imm:$src3))]>,
+       (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -5853,7 +5857,7 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
      !if(Is2Addr,
         !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(OpcodeStr,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
-     [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
+     [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
     Sched<[sched]>;
   def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
     (ins RC:$src1, x86memop:$src2, u8imm:$src3),
@@ -5863,27 +5867,27 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
       !strconcat(OpcodeStr,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
    [(set RC:$dst,
-     (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
+     (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
    Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
-def BlendCommuteImm2 : SDNodeXForm<imm, [{
+def BlendCommuteImm2 : SDNodeXForm<timm, [{
   uint8_t Imm = N->getZExtValue() & 0x03;
  return getI8Imm(Imm ^ 0x03, SDLoc(N));
 }]>;
 
-def BlendCommuteImm4 : SDNodeXForm<imm, [{
+def BlendCommuteImm4 : SDNodeXForm<timm, [{
   uint8_t Imm = N->getZExtValue() & 0x0f;
  return getI8Imm(Imm ^ 0x0f, SDLoc(N));
 }]>;
 
-def BlendCommuteImm8 : SDNodeXForm<imm, [{
+def BlendCommuteImm8 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0xff;
 return getI8Imm(Imm ^ 0xff, SDLoc(N));
 }]>;
 
 // Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
-def BlendScaleImm4 : SDNodeXForm<imm, [{
+def BlendScaleImm4 : SDNodeXForm<timm, [{
   uint8_t Imm = N->getZExtValue();
   uint8_t NewImm = 0;
   for (unsigned i = 0; i != 4; ++i) {
@@ -5894,7 +5898,7 @@ def BlendScaleImm4 : SDNodeXForm<imm, [{
 }]>;
 
 // Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
-def BlendScaleImm2 : SDNodeXForm<imm, [{
+def BlendScaleImm2 : SDNodeXForm<timm, [{
   uint8_t Imm = N->getZExtValue();
   uint8_t NewImm = 0;
   for (unsigned i = 0; i != 2; ++i) {
@@ -5905,7 +5909,7 @@ def BlendScaleImm2 : SDNodeXForm<imm, [{
 }]>;
 
 // Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
-def BlendScaleImm2to4 : SDNodeXForm<imm, [{
+def BlendScaleImm2to4 : SDNodeXForm<timm, [{
   uint8_t Imm = N->getZExtValue();
   uint8_t NewImm = 0;
   for (unsigned i = 0; i != 2; ++i) {
@@ -5916,7 +5920,7 @@ def BlendScaleImm2to4 : SDNodeXForm<imm, [{
 }]>;
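The BlendScale transforms above widen a coarse blend mask to a finer-grained instruction: each vXi32/vXi64 mask bit is duplicated so the same lane selection can be expressed as a pblendw (or pblendd) immediate. A standalone rendering of what `BlendScaleImm4` computes, with a worked example (it mirrors the inline C++ above; the helper name is mine):

```cpp
#include <cassert>
#include <cstdint>

// Widen a 4-bit vXi32 blend mask to the 8-bit word-granularity mask pblendw
// expects: every dword lane that is selected becomes two selected word lanes.
static uint8_t blendScaleImm4(uint8_t Imm) {
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 4; ++i)
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2); // one dword lane -> two word lanes
  return NewImm;
}

int main() {
  assert(blendScaleImm4(0b0101) == 0b00110011); // lanes 0 and 2 -> word pairs
  assert(blendScaleImm4(0b1000) == 0b11000000);
  // The Commute variants that follow additionally invert the selection
  // (swapped sources pick the opposite lanes), i.e. the bitwise complement:
  assert(uint8_t(~blendScaleImm4(0b0101)) == 0b11001100);
}
```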
 // Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
-def BlendScaleCommuteImm4 : SDNodeXForm<imm, [{
+def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
   uint8_t Imm = N->getZExtValue();
   uint8_t NewImm = 0;
   for (unsigned i = 0; i != 4; ++i) {
@@ -5927,7 +5931,7 @@ def BlendScaleCommuteImm4 : SDNodeXForm<imm, [{
 }]>;
 
 // Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
-def BlendScaleCommuteImm2 : SDNodeXForm<imm, [{
+def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
   uint8_t Imm = N->getZExtValue();
   uint8_t NewImm = 0;
   for (unsigned i = 0; i != 2; ++i) {
@@ -5938,7 +5942,7 @@ def BlendScaleCommuteImm2 : SDNodeXForm<imm, [{
 }]>;
 
 // Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
-def BlendScaleCommuteImm2to4 : SDNodeXForm<imm, [{
+def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
   uint8_t Imm = N->getZExtValue();
   uint8_t NewImm = 0;
   for (unsigned i = 0; i != 2; ++i) {
@@ -6008,7 +6012,7 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
          !strconcat(OpcodeStr,
                     "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(OpcodeStr,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
-        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
+        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
        Sched<[sched]>;
   def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
@@ -6018,14 +6022,14 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
           !strconcat(OpcodeStr,
                      "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
-         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
+         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 
   // Pattern to commute if load is in first source.
-  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, imm:$src3)),
+  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
             (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
-             (commuteXForm imm:$src3))>;
+             (commuteXForm timm:$src3))>;
 }
 
 let Predicates = [HasAVX] in {
@@ -6061,37 +6065,37 @@ let Predicates = [HasAVX2] in {
 // Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
 // ExecutionDomainFixPass will cleanup domains later on.
 let Predicates = [HasAVX1Only] in {
-def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
-          (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
-          (VBLENDPDYrmi VR256:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
-          (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
+def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
+          (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
+def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
+          (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
+def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
+          (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;
 
 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
 // it from becoming movsd via commuting under optsize.
-def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
-          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>;
-def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
-          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
-def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
-          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
+          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
+          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
+def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
+          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
 
-def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), imm:$src3),
-          (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), imm:$src3),
-          (VBLENDPSYrmi VR256:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, imm:$src3),
-          (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 imm:$src3))>;
+def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
+          (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
+def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
+          (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
+def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
+          (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;
 
 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
 // it from becoming movss via commuting under optsize.
-def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
-          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>;
-def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), imm:$src3),
-          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
-def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, imm:$src3),
-          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
+def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
+          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
+          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
+          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
 }
 
 defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
@@ -6107,19 +6111,19 @@ defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
 
 let Predicates = [UseSSE41] in {
 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
 // it from becoming movss via commuting under optsize.
-def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
-          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>;
-def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), imm:$src3),
-          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
-def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, imm:$src3),
-          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
+          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
+          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
+def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
+          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
 
-def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
-          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>;
-def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), imm:$src3),
-          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
-def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, imm:$src3),
-          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
+def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
+          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
+          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
+          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
 }
 
 // For insertion into the zero index (low half) of a 256-bit vector, it is
@@ -6592,7 +6596,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
                          "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                          [(set VR128:$dst,
                            (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
-                                              (i8 imm:$src3)))]>, TA,
+                                              (i8 timm:$src3)))]>, TA,
                          Sched<[SchedWriteVecIMul.XMM]>;
   def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
                          (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
@@ -6600,7 +6604,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
                          [(set VR128:$dst,
                            (int_x86_sha1rnds4 VR128:$src1,
                                               (memop addr:$src2),
-                                              (i8 imm:$src3)))]>, TA,
+                                              (i8 timm:$src3)))]>, TA,
                          Sched<[SchedWriteVecIMul.XMM.Folded,
                                 SchedWriteVecIMul.XMM.ReadAfterFold]>;
 
@@ -6718,26 +6722,26 @@ let Predicates = [HasAVX, HasAES] in {
                       (ins VR128:$src1, u8imm:$src2),
                       "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
-                        (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
+                        (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
                       Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
   def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
                       (ins i128mem:$src1, u8imm:$src2),
                       "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
-                        (int_x86_aesni_aeskeygenassist (load addr:$src1), imm:$src2))]>,
+                        (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>,
                       Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
 }
 
 def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
                       (ins VR128:$src1, u8imm:$src2),
                       "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
-                        (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
+                        (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
                       Sched<[WriteAESKeyGen]>;
 def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
                       (ins i128mem:$src1, u8imm:$src2),
                       "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
-                        (int_x86_aesni_aeskeygenassist (memop addr:$src1), imm:$src2))]>,
+                        (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>,
                       Sched<[WriteAESKeyGen.Folded]>;
 
 //===----------------------------------------------------------------------===//
@@ -6745,7 +6749,7 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
 //===----------------------------------------------------------------------===//
 
 // Immediate transform to help with commuting.
-def PCLMULCommuteImm : SDNodeXForm<imm, [{
+def PCLMULCommuteImm : SDNodeXForm<timm, [{
   uint8_t Imm = N->getZExtValue();
   return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
 }]>;
@@ -6758,7 +6762,7 @@ let Predicates = [NoAVX, HasPCLMUL] in {
                 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                 [(set VR128:$dst,
-                  (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
+                  (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>,
                 Sched<[WriteCLMul]>;
 
     def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
@@ -6766,14 +6770,14 @@ let Predicates = [NoAVX, HasPCLMUL] in {
                 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                 [(set VR128:$dst,
                    (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
-                    imm:$src3))]>,
+                    timm:$src3))]>,
                 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
   } // Constraints = "$src1 = $dst"
 
   def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
-                                (i8 imm:$src3)),
+                                (i8 timm:$src3)),
             (PCLMULQDQrm VR128:$src1, addr:$src2,
-              (PCLMULCommuteImm imm:$src3))>;
+              (PCLMULCommuteImm timm:$src3))>;
 } // Predicates = [NoAVX, HasPCLMUL]
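`PCLMULCommuteImm` works because the low nibble of the pclmulqdq immediate selects the quadword of the first source and the high nibble that of the second, so swapping the sources only requires swapping the two nibbles. A standalone version with a worked example (the helper name is mine):

```cpp
#include <cassert>
#include <cstdint>

// Rotate the pclmulqdq selector byte by four bits, i.e. swap which nibble
// applies to which source operand (same computation as the SDNodeXForm).
static uint8_t pclmulCommuteImm(uint8_t Imm) {
  return uint8_t((Imm >> 4) | (Imm << 4));
}

int main() {
  assert(pclmulCommuteImm(0x01) == 0x10); // "high qword of src1" becomes
                                          // "high qword of src2" after the swap
  assert(pclmulCommuteImm(0x11) == 0x11); // symmetric selections are fixpoints
}
```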
 
 // SSE aliases
@@ -6795,21 +6799,21 @@ multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
           (ins RC:$src1, RC:$src2, u8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set RC:$dst,
-            (IntId RC:$src1, RC:$src2, imm:$src3))]>,
+            (IntId RC:$src1, RC:$src2, timm:$src3))]>,
           Sched<[WriteCLMul]>;
 
   def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
           (ins RC:$src1, MemOp:$src2, u8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set RC:$dst,
-            (IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>,
+            (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
           Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
 
   // We can commute a load in the first operand by swapping the sources and
   // rotating the immediate.
-  def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 imm:$src3)),
+  def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
             (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
-              (PCLMULCommuteImm imm:$src3))>;
+              (PCLMULCommuteImm timm:$src3))>;
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
@@ -6853,8 +6857,8 @@ let Constraints = "$src = $dst" in {
 def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
                  (ins VR128:$src, u8imm:$len, u8imm:$idx),
                  "extrq\t{$idx, $len, $src|$src, $len, $idx}",
-                 [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
-                                    imm:$idx))]>,
+                 [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len,
+                                    timm:$idx))]>,
                  PD, Sched<[SchedWriteVecALU.XMM]>;
 def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
                (ins VR128:$src, VR128:$mask),
@@ -6867,7 +6871,7 @@ def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
                    "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
                    [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
-                                      imm:$len, imm:$idx))]>,
+                                      timm:$len, timm:$idx))]>,
                    XD, Sched<[SchedWriteVecALU.XMM]>;
 def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
                  (ins VR128:$src, VR128:$mask),
@@ -6907,10 +6911,10 @@ def : Pat<(nontemporalstore FR64:$src, addr:$dst),
 //
 class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
                        X86MemOperand x86memop, ValueType VT,
-                       PatFrag ld_frag, SchedWrite Sched> :
+                       PatFrag bcast_frag, SchedWrite Sched> :
   AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-        [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
+        [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
        Sched<[Sched]>, VEX;
 
 // AVX2 adds register forms
@@ -6923,15 +6927,15 @@ class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
 
 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
   def VBROADCASTSSrm  : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
-                                         f32mem, v4f32, loadf32,
+                                         f32mem, v4f32, X86VBroadcastld32,
                                          SchedWriteFShuffle.XMM.Folded>;
   def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
-                                         f32mem, v8f32, loadf32,
+                                         f32mem, v8f32, X86VBroadcastld32,
                                          SchedWriteFShuffle.XMM.Folded>, VEX_L;
 }
 let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
 def VBROADCASTSDYrm  : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
-                                        v4f64, loadf64,
+                                        v4f64, X86VBroadcastld64,
                                         SchedWriteFShuffle.XMM.Folded>, VEX_L;
 
 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
@@ -6944,15 +6948,6 @@ let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
 def VBROADCASTSDYrr  : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
                                          v4f64, v2f64, WriteFShuffle256>, VEX_L;
 
-let Predicates = [HasAVX, NoVLX] in {
-  def : Pat<(v4f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
-            (VBROADCASTSSrm addr:$src)>;
-  def : Pat<(v8f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
-            (VBROADCASTSSYrm addr:$src)>;
-  def : Pat<(v4f64 (X86VBroadcast (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
-            (VBROADCASTSDYrm addr:$src)>;
-}
-
 //===----------------------------------------------------------------------===//
 // VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
 // halves of a 256-bit vector.
@@ -7081,27 +7076,29 @@ let Predicates = [HasAVX1Only] in {
 //
 multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
                           Intrinsic IntLd, Intrinsic IntLd256,
-                          Intrinsic IntSt, Intrinsic IntSt256> {
+                          Intrinsic IntSt, Intrinsic IntSt256,
+                          X86SchedWriteMaskMove schedX,
+                          X86SchedWriteMaskMove schedY> {
   def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
              (ins VR128:$src1, f128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
-            VEX_4V, Sched<[WriteFMaskedLoad]>;
+            VEX_4V, Sched<[schedX.RM]>;
   def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
              (ins VR256:$src1, f256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
-            VEX_4V, VEX_L, Sched<[WriteFMaskedLoadY]>;
+            VEX_4V, VEX_L, Sched<[schedY.RM]>;
   def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
-           VEX_4V, Sched<[WriteFMaskedStore]>;
+           VEX_4V, Sched<[schedX.MR]>;
   def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
            (ins f256mem:$dst, VR256:$src1, VR256:$src2),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
          [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
-         VEX_4V, VEX_L, Sched<[WriteFMaskedStoreY]>;
+         VEX_4V, VEX_L, Sched<[schedY.MR]>;
 }
 
 let ExeDomain = SSEPackedSingle in
@@ -7109,13 +7106,15 @@ defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
                                  int_x86_avx_maskload_ps,
                                  int_x86_avx_maskload_ps_256,
                                  int_x86_avx_maskstore_ps,
-                                 int_x86_avx_maskstore_ps_256>;
+                                 int_x86_avx_maskstore_ps_256,
+                                 WriteFMaskMove32, WriteFMaskMove32Y>;
 let ExeDomain = SSEPackedDouble in
 defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
                                  int_x86_avx_maskload_pd,
                                  int_x86_avx_maskload_pd_256,
                                  int_x86_avx_maskstore_pd,
-                                 int_x86_avx_maskstore_pd_256>;
+                                 int_x86_avx_maskstore_pd_256,
+                                 WriteFMaskMove64, WriteFMaskMove64Y>;
 
 //===----------------------------------------------------------------------===//
 // VPERMIL - Permute Single and Double Floating-Point Values
@@ -7143,13 +7142,13 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
   def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
              (ins RC:$src1, u8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-            [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
+            [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
             Sched<[sched]>;
   def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
              (ins x86memop_f:$src1, u8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set RC:$dst,
-             (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
+             (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
            Sched<[sched.Folded]>;
 }// Predicates = [HasAVX, NoVLX]
 }
 
@@ -7181,38 +7180,38 @@ def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
           (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
-                            (i8 imm:$src3))))]>, VEX_4V, VEX_L,
+                            (i8 timm:$src3))))]>, VEX_4V, VEX_L,
          Sched<[WriteFShuffle256]>;
 def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
           (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
         [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
-                           (i8 imm:$src3)))]>, VEX_4V, VEX_L,
+                           (i8 timm:$src3)))]>, VEX_4V, VEX_L,
         Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
 }
 
 // Immediate transform to help with commuting.
-def Perm2XCommuteImm : SDNodeXForm<imm, [{
+def Perm2XCommuteImm : SDNodeXForm<timm, [{
   return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
 }]>;
 
 let Predicates = [HasAVX] in {
 // Pattern with load in other operand.
 def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
-                                VR256:$src1, (i8 imm:$imm))),
-          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
+                                VR256:$src1, (i8 timm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
 }
 
 let Predicates = [HasAVX1Only] in {
-def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
-          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, timm:$imm)>;
 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
-                  (loadv4i64 addr:$src2), (i8 imm:$imm))),
-          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+                  (loadv4i64 addr:$src2), (i8 timm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, timm:$imm)>;
 // Pattern with load in other operand.
 def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
-                                VR256:$src1, (i8 imm:$imm))),
-          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
+                                VR256:$src1, (i8 timm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
 }
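`Perm2XCommuteImm` XORs the immediate with 0x22. My reading of the vperm2f128/vperm2i128 encoding: bits [1:0] and [5:4] each pick a 128-bit half, and within each field the high bit (bit 1, bit 5) selects which source that half comes from, so flipping exactly those two bits compensates for swapped operands. A standalone check (the helper name is mine):

```cpp
#include <cassert>
#include <cstdint>

// Flip the source-select bit of each 4-bit lane selector in the vperm2x128
// immediate; this is what makes the commuted pattern above pick the same data.
static uint8_t perm2xCommuteImm(uint8_t Imm) { return Imm ^ 0x22; }

int main() {
  // 0x20 = "low half of src1, low half of src2"; with the operands swapped the
  // same shuffle is expressed as 0x02 = "low half of src2, low half of src1".
  assert(perm2xCommuteImm(0x20) == 0x02);
  assert(perm2xCommuteImm(perm2xCommuteImm(0x31)) == 0x31); // involution
}
```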
 
 //===----------------------------------------------------------------------===//
@@ -7257,7 +7256,7 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
   def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
                (ins RC:$src1, i32u8imm:$src2),
                "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-               [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>,
+               [(set VR128:$dst, (X86cvtps2ph RC:$src1, timm:$src2))]>,
               TAPD, VEX, Sched<[RR]>;
   let hasSideEffects = 0, mayStore = 1 in
   def mr : Ii8<0x1D, MRMDestMem, (outs),
@@ -7282,15 +7281,15 @@ let Predicates = [HasF16C, NoVLX] in {
             (VCVTPH2PSrm addr:$src)>;
 
   def : Pat<(store (f64 (extractelt
-                         (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
+                         (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))),
                          (iPTR 0))), addr:$dst),
-            (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
+            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
   def : Pat<(store (i64 (extractelt
-                         (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
+                         (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))),
                          (iPTR 0))), addr:$dst),
-            (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
-  def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst),
-            (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
+            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
+  def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
+            (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
 }
 
 // Patterns for matching conversions from float to half-float and vice versa.
@@ -7327,20 +7326,20 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
         (ins RC:$src1, RC:$src2, u8imm:$src3),
         !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
+        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
        Sched<[sched]>, VEX_4V;
   def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !strconcat(OpcodeStr,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set RC:$dst,
-        (OpVT (OpNode RC:$src1, (load addr:$src2), imm:$src3)))]>,
+        (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
 
   // Pattern to commute if load is in first source.
-  def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, imm:$src3)),
+  def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
             (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
-             (commuteXForm imm:$src3))>;
+             (commuteXForm timm:$src3))>;
 }
 
 let Predicates = [HasAVX2] in {
@@ -7351,19 +7350,19 @@ defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
                                 SchedWriteBlend.YMM, VR256, i256mem,
                                 BlendCommuteImm8>, VEX_L;
 
-def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
-          (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 imm:$src3))>;
-def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
-          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
-def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
-          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
+def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
+          (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
+          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
+          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
 
-def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
-          (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 imm:$src3))>;
-def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
-          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 imm:$src3))>;
-def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
-          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 imm:$src3))>;
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
+          (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
+          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>;
+def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
+          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>;
 }
 
 // For insertion into the zero index (low half) of a 256-bit vector, it is
@@ -7407,7 +7406,7 @@ def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0
 // destination operand
 //
 multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
-                          X86MemOperand x86memop, PatFrag ld_frag,
+                          X86MemOperand x86memop, PatFrag bcast_frag,
                           ValueType OpVT128, ValueType OpVT256, Predicate prd> {
   let Predicates = [HasAVX2, prd] in {
     def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -7418,7 +7417,7 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
     def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst,
-                     (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
+                     (OpVT128 (bcast_frag addr:$src)))]>,
                     Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
     def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
@@ -7428,7 +7427,7 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
     def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set VR256:$dst,
-                      (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
+                      (OpVT256 (bcast_frag addr:$src)))]>,
                      Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;
 
     // Provide aliases for broadcast from the same register class that
@@ -7439,13 +7438,13 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
   }
 }
 
-defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
+defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8,
                                     v16i8, v32i8, NoVLX_Or_NoBWI>;
-defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
+defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16,
                                     v8i16, v16i16, NoVLX_Or_NoBWI>;
-defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
+defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32,
                                     v4i32, v8i32, NoVLX>;
-defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
+defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
                                     v2i64, v4i64, NoVLX>;
 
 let Predicates = [HasAVX2, NoVLX] in {
@@ -7455,14 +7454,11 @@ let Predicates = [HasAVX2, NoVLX] in {
   def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
             (VPBROADCASTQYrm addr:$src)>;
 
-  def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+  // FIXME this is to handle aligned extloads from i8/i16.
+  def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
             (VPBROADCASTDrm addr:$src)>;
-  def : Pat<(v8i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+  def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
             (VPBROADCASTDYrm addr:$src)>;
-  def : Pat<(v2i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
-            (VPBROADCASTQrm addr:$src)>;
-  def : Pat<(v4i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
-            (VPBROADCASTQYrm addr:$src)>;
 }
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
@@ -7483,17 +7479,12 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   def : Pat<(v16i16 (X86VBroadcast
                      (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
             (VPBROADCASTWYrm addr:$src)>;
-}
 
-let Predicates = [HasAVX2, NoVLX] in {
-  // Provide aliases for broadcast from the same register class that
-  // automatically does the extract.
-  def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
-            (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
-                                                    sub_xmm)))>;
-  def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
-            (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
-                                                    sub_xmm)))>;
+  // FIXME this is to handle aligned extloads from i8.
+  def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
+            (VPBROADCASTWrm addr:$src)>;
+  def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
+            (VPBROADCASTWYrm addr:$src)>;
 }
 
 let Predicates = [HasAVX2, NoVLX] in {
@@ -7509,45 +7500,41 @@ let Predicates = [HasAVX2, NoVLX] in {
 
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
-            (VPBROADCASTBrr (v16i8 (COPY_TO_REGCLASS
+            (VPBROADCASTBrr (VMOVDI2PDIrr
                              (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
-                                             GR8:$src, sub_8bit)),
-                             VR128)))>;
+                                                 GR8:$src, sub_8bit))))>;
   def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
-            (VPBROADCASTBYrr (v16i8 (COPY_TO_REGCLASS
+            (VPBROADCASTBYrr (VMOVDI2PDIrr
                               (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
-                                              GR8:$src, sub_8bit)),
-                              VR128)))>;
+                                                  GR8:$src, sub_8bit))))>;
   def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
-            (VPBROADCASTWrr (v8i16 (COPY_TO_REGCLASS
+            (VPBROADCASTWrr (VMOVDI2PDIrr
                              (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
-                                             GR16:$src, sub_16bit)),
-                             VR128)))>;
+                                                 GR16:$src, sub_16bit))))>;
   def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
-            (VPBROADCASTWYrr (v8i16 (COPY_TO_REGCLASS
+            (VPBROADCASTWYrr (VMOVDI2PDIrr
                               (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
-                                              GR16:$src, sub_16bit)),
-                              VR128)))>;
+                                                  GR16:$src, sub_16bit))))>;
 }
 let Predicates = [HasAVX2, NoVLX] in {
   def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
-            (VPBROADCASTDrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
+            (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
   def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
-            (VPBROADCASTDYrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
+            (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
   def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
-            (VPBROADCASTQrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
+            (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
   def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
-            (VPBROADCASTQYrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
+            (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
 }
 
 // AVX1 broadcast patterns
 let Predicates = [HasAVX1Only] in {
-def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
+def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
           (VBROADCASTSSYrm addr:$src)>;
-def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
+def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
           (VBROADCASTSDYrm addr:$src)>;
-def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
           (VBROADCASTSSrm addr:$src)>;
 }
 
@@ -7557,12 +7544,12 @@ let Predicates = [HasAVX, NoVLX] in {
   // 128bit broadcasts:
   def : Pat<(v2f64 (X86VBroadcast f64:$src)),
             (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
-  def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
+  def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
             (VMOVDDUPrm addr:$src)>;
 
   def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
             (VMOVDDUPrr VR128:$src)>;
-  def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+  def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
            (VMOVDDUPrm addr:$src)>;
   def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
            (VMOVDDUPrm addr:$src)>;
@@ -7581,19 +7568,19 @@ let Predicates = [HasAVX1Only] in {
             (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
 
   def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
-            (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)>;
+            (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
   def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
             (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
-             (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), sub_xmm),
-             (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), 1)>;
+             (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
+             (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>;
   def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
             (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
-             (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), sub_xmm),
-             (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), 1)>;
+             (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm),
+             (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>;
 
   def : Pat<(v2i64 (X86VBroadcast i64:$src)),
-            (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)>;
-  def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
+            (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
+  def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
             (VMOVDDUPrm addr:$src)>;
 }
 
@@ -7636,7 +7623,7 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                    !strconcat(OpcodeStr,
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR256:$dst,
-                     (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
+                     (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
                    Sched<[Sched]>, VEX, VEX_L;
   def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                      (ins memOp:$src1, u8imm:$src2),
@@ -7644,7 +7631,7 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR256:$dst,
                        (OpVT (X86VPermi (mem_frag addr:$src1),
-                              (i8 imm:$src2))))]>,
+                              (i8 timm:$src2))))]>,
                      Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
 }
 }
 
@@ -7663,19 +7650,19 @@ def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
           (ins VR256:$src1, VR256:$src2, u8imm:$src3),
           "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
-                             (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
+                             (i8 timm:$src3))))]>, Sched<[WriteShuffle256]>,
           VEX_4V, VEX_L;
 def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
           (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
           "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
-                             (i8 imm:$src3)))]>,
+                             (i8 timm:$src3)))]>,
           Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>,
          VEX_4V, VEX_L;
 
 let Predicates = [HasAVX2] in
 def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
-                                VR256:$src1, (i8 imm:$imm))),
-          (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
+                                VR256:$src1, (i8 timm:$imm))),
+          (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
 
 
 //===----------------------------------------------------------------------===//
@@ -7760,7 +7747,7 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                 int_x86_avx2_maskstore_q_256>, VEX_W;
 
 multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
-                            ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
+                            ValueType MaskVT> {
     // masked store
    def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
             (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
@@ -7772,23 +7759,23 @@ multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
            (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
 }
 
 let Predicates = [HasAVX] in {
-  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
-  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>;
-  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
-  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
+  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>;
+  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64>;
+  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
+  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
 }
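With the blend-related arguments gone, `maskmov_lowering` is a plain mapping from LLVM's generic masked load/store nodes to the vmaskmov/vpmaskmov instructions. For reference, the corresponding operation at the source level (a sketch using the AVX intrinsics; the function name is mine):

```cpp
#include <immintrin.h>

// Masked copy of 8 floats: lanes whose mask MSB is clear read as zero on the
// load and are left untouched by the store, matching the semantics these
// patterns assign to masked_load/masked_store.
void masked_copy8(const float *src, float *dst, __m256i mask) {
  __m256 v = _mm256_maskload_ps(src, mask);
  _mm256_maskstore_ps(dst, mask, v);
}
```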
v4f64, v4i64, "VBLENDVPDY", v8i32>; + defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>; + defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64>; + defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>; + defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>; } let Predicates = [HasAVX1Only] in { // load/store i32/i64 not supported use ps/pd version - defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>; - defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>; - defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>; - defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>; + defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>; + defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>; + defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32>; + defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64>; } let Predicates = [HasAVX2] in { - defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>; - defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>; - defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>; - defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>; + defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>; + defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>; + defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32>; + defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64>; } //===----------------------------------------------------------------------===// @@ -7956,13 +7943,13 @@ multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT, OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in { def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$src3), "", - [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))], + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))], SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>; def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "", [(set RC:$dst, (OpVT (OpNode RC:$src1, (MemOpFrag addr:$src2), - imm:$src3)))], SSEPackedInt>, + timm:$src3)))], SSEPackedInt>, Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>; } } diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 7050e1917494..7f41feb6c0d9 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -43,7 +43,7 @@ def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))]>; let SchedRW = [WriteSystem] in { def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap", - [(int_x86_int imm:$trap)]>; + [(int_x86_int timm:$trap)]>; def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB; diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td index fc0da845299f..3a1212342a13 100644 --- a/lib/Target/X86/X86InstrTSX.td +++ b/lib/Target/X86/X86InstrTSX.td @@ -45,7 +45,7 @@ def XTEST : I<0x01, MRM_D6, (outs), (ins), def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm), "xabort\t$imm", - [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>; + [(int_x86_xabort timm:$imm)]>, Requires<[HasRTM]>; } // SchedRW // HLE prefixes diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td index 66ca78556b82..229af366d940 100644 --- a/lib/Target/X86/X86InstrXOP.td +++ b/lib/Target/X86/X86InstrXOP.td @@ 
-143,13 +143,13 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>, + (vt128 (OpNode (vt128 VR128:$src1), timm:$src2)))]>, XOP, Sched<[sched]>; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (vt128 (OpNode (vt128 (load addr:$src1)), imm:$src2)))]>, + (vt128 (OpNode (vt128 (load addr:$src1)), timm:$src2)))]>, XOP, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -251,7 +251,7 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128, "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), - imm:$cc)))]>, + timm:$cc)))]>, XOP_4V, Sched<[sched]>; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, u8imm:$cc), @@ -260,14 +260,14 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128, [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 (load addr:$src2)), - imm:$cc)))]>, + timm:$cc)))]>, XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; } def : Pat<(OpNode (load addr:$src2), - (vt128 VR128:$src1), imm:$cc), + (vt128 VR128:$src1), timm:$cc), (!cast<Instruction>(NAME#"mi") VR128:$src1, addr:$src2, - (CommuteVPCOMCC imm:$cc))>; + (CommuteVPCOMCC timm:$cc))>; } defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8, SchedWriteVecALU.XMM>; @@ -418,27 +418,27 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC, ValueType VT, PatFrag FPLdFrag, PatFrag IntLdFrag, X86FoldableSchedWrite sched> { def rr : IXOP5<Opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4), + (ins RC:$src1, RC:$src2, RC:$src3, u4imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set RC:$dst, - (VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 imm:$src4))))]>, + (VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 timm:$src4))))]>, Sched<[sched]>; def rm : IXOP5<Opc, MRMSrcMemOp4, (outs RC:$dst), - (ins RC:$src1, RC:$src2, intmemop:$src3, u8imm:$src4), + (ins RC:$src1, RC:$src2, intmemop:$src3, u4imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set RC:$dst, (VT (X86vpermil2 RC:$src1, RC:$src2, (IntLdFrag addr:$src3), - (i8 imm:$src4))))]>, VEX_W, + (i8 timm:$src4))))]>, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; def mr : IXOP5<Opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, fpmemop:$src2, RC:$src3, u8imm:$src4), + (ins RC:$src1, fpmemop:$src2, RC:$src3, u4imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set RC:$dst, (VT (X86vpermil2 RC:$src1, (FPLdFrag addr:$src2), - RC:$src3, (i8 imm:$src4))))]>, + RC:$src3, (i8 timm:$src4))))]>, Sched<[sched.Folded, sched.ReadAfterFold, // fpmemop:$src2 ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, @@ -447,7 +447,7 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC, // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in def rr_REV : IXOP5<Opc, MRMSrcRegOp4, (outs RC:$dst), - (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4), + (ins RC:$src1, RC:$src2, RC:$src3, u4imm:$src4), !strconcat(OpcodeStr, 
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), []>, VEX_W, Sched<[sched]>, FoldGenData<NAME#rr>; diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index 892a083f4d1a..01620b7b64c9 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -60,7 +60,7 @@ public: X86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &STI, const X86RegisterBankInfo &RBI); - bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override; + bool select(MachineInstr &I) override; static const char *getName() { return DEBUG_TYPE; } private: @@ -94,11 +94,9 @@ private: MachineFunction &MF) const; bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI, - MachineFunction &MF, - CodeGenCoverage &CoverageInfo) const; + MachineFunction &MF); bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI, - MachineFunction &MF, - CodeGenCoverage &CoverageInfo) const; + MachineFunction &MF); bool selectInsert(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; bool selectExtract(MachineInstr &I, MachineRegisterInfo &MRI, @@ -217,7 +215,7 @@ static unsigned getSubRegIndex(const TargetRegisterClass *RC) { } static const TargetRegisterClass *getRegClassFromGRPhysReg(unsigned Reg) { - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(Register::isPhysicalRegister(Reg)); if (X86::GR64RegClass.contains(Reg)) return &X86::GR64RegClass; if (X86::GR32RegClass.contains(Reg)) @@ -233,15 +231,15 @@ static const TargetRegisterClass *getRegClassFromGRPhysReg(unsigned Reg) { // Set X86 Opcode and constrain DestReg. bool X86InstructionSelector::selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const { - unsigned DstReg = I.getOperand(0).getReg(); + Register DstReg = I.getOperand(0).getReg(); const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); - unsigned SrcReg = I.getOperand(1).getReg(); + Register SrcReg = I.getOperand(1).getReg(); const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { + if (Register::isPhysicalRegister(DstReg)) { assert(I.isCopy() && "Generic operators do not allow physical registers"); if (DstSize > SrcSize && SrcRegBank.getID() == X86::GPRRegBankID && @@ -253,7 +251,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I, if (SrcRC != DstRC) { // This case can be generated by ABI lowering, performe anyext - unsigned ExtSrc = MRI.createVirtualRegister(DstRC); + Register ExtSrc = MRI.createVirtualRegister(DstRC); BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::SUBREG_TO_REG)) .addDef(ExtSrc) @@ -268,12 +266,12 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I, return true; } - assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) || I.isCopy()) && + assert((!Register::isPhysicalRegister(SrcReg) || I.isCopy()) && "No phys reg on generic operators"); assert((DstSize == SrcSize || // Copies are a mean to setup initial types, the number of // bits may not exactly match. 
- (TargetRegisterInfo::isPhysicalRegister(SrcReg) && + (Register::isPhysicalRegister(SrcReg) && DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI))) && "Copy with different width?!"); @@ -282,7 +280,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I, if (SrcRegBank.getID() == X86::GPRRegBankID && DstRegBank.getID() == X86::GPRRegBankID && SrcSize > DstSize && - TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + Register::isPhysicalRegister(SrcReg)) { // Change the physical register to performe truncate. const TargetRegisterClass *SrcRC = getRegClassFromGRPhysReg(SrcReg); @@ -308,8 +306,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I, return true; } -bool X86InstructionSelector::select(MachineInstr &I, - CodeGenCoverage &CoverageInfo) const { +bool X86InstructionSelector::select(MachineInstr &I) { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -333,7 +330,7 @@ bool X86InstructionSelector::select(MachineInstr &I, assert(I.getNumOperands() == I.getNumExplicitOperands() && "Generic instruction has unexpected implicit operands\n"); - if (selectImpl(I, CoverageInfo)) + if (selectImpl(I, *CoverageInfo)) return true; LLVM_DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs())); @@ -370,10 +367,10 @@ bool X86InstructionSelector::select(MachineInstr &I, case TargetOpcode::G_UADDE: return selectUadde(I, MRI, MF); case TargetOpcode::G_UNMERGE_VALUES: - return selectUnmergeValues(I, MRI, MF, CoverageInfo); + return selectUnmergeValues(I, MRI, MF); case TargetOpcode::G_MERGE_VALUES: case TargetOpcode::G_CONCAT_VECTORS: - return selectMergeValues(I, MRI, MF, CoverageInfo); + return selectMergeValues(I, MRI, MF); case TargetOpcode::G_EXTRACT: return selectExtract(I, MRI, MF); case TargetOpcode::G_INSERT: @@ -512,7 +509,7 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I, assert((Opc == TargetOpcode::G_STORE || Opc == TargetOpcode::G_LOAD) && "unexpected instruction"); - const unsigned DefReg = I.getOperand(0).getReg(); + const Register DefReg = I.getOperand(0).getReg(); LLT Ty = MRI.getType(DefReg); const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); @@ -572,7 +569,7 @@ bool X86InstructionSelector::selectFrameIndexOrGep(MachineInstr &I, assert((Opc == TargetOpcode::G_FRAME_INDEX || Opc == TargetOpcode::G_GEP) && "unexpected instruction"); - const unsigned DefReg = I.getOperand(0).getReg(); + const Register DefReg = I.getOperand(0).getReg(); LLT Ty = MRI.getType(DefReg); // Use LEA to calculate frame index and GEP @@ -625,7 +622,7 @@ bool X86InstructionSelector::selectGlobalValue(MachineInstr &I, AM.Base.Reg = X86::RIP; } - const unsigned DefReg = I.getOperand(0).getReg(); + const Register DefReg = I.getOperand(0).getReg(); LLT Ty = MRI.getType(DefReg); unsigned NewOpc = getLeaOP(Ty, STI); @@ -644,7 +641,7 @@ bool X86InstructionSelector::selectConstant(MachineInstr &I, assert((I.getOpcode() == TargetOpcode::G_CONSTANT) && "unexpected instruction"); - const unsigned DefReg = I.getOperand(0).getReg(); + const Register DefReg = I.getOperand(0).getReg(); LLT Ty = MRI.getType(DefReg); if (RBI.getRegBank(DefReg, MRI, TRI)->getID() != X86::GPRRegBankID) @@ -717,8 +714,8 @@ bool X86InstructionSelector::selectTruncOrPtrToInt(MachineInstr &I, I.getOpcode() == TargetOpcode::G_PTRTOINT) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DstReg = 
I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); const LLT DstTy = MRI.getType(DstReg); const LLT SrcTy = MRI.getType(SrcReg); @@ -781,8 +778,8 @@ bool X86InstructionSelector::selectZext(MachineInstr &I, MachineFunction &MF) const { assert((I.getOpcode() == TargetOpcode::G_ZEXT) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); const LLT DstTy = MRI.getType(DstReg); const LLT SrcTy = MRI.getType(SrcReg); @@ -892,8 +889,8 @@ bool X86InstructionSelector::selectAnyext(MachineInstr &I, MachineFunction &MF) const { assert((I.getOpcode() == TargetOpcode::G_ANYEXT) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); const LLT DstTy = MRI.getType(DstReg); const LLT SrcTy = MRI.getType(SrcReg); @@ -952,8 +949,8 @@ bool X86InstructionSelector::selectCmp(MachineInstr &I, std::tie(CC, SwapArgs) = X86::getX86ConditionCode( (CmpInst::Predicate)I.getOperand(1).getPredicate()); - unsigned LHS = I.getOperand(2).getReg(); - unsigned RHS = I.getOperand(3).getReg(); + Register LHS = I.getOperand(2).getReg(); + Register RHS = I.getOperand(3).getReg(); if (SwapArgs) std::swap(LHS, RHS); @@ -998,8 +995,8 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I, MachineFunction &MF) const { assert((I.getOpcode() == TargetOpcode::G_FCMP) && "unexpected instruction"); - unsigned LhsReg = I.getOperand(2).getReg(); - unsigned RhsReg = I.getOperand(3).getReg(); + Register LhsReg = I.getOperand(2).getReg(); + Register RhsReg = I.getOperand(3).getReg(); CmpInst::Predicate Predicate = (CmpInst::Predicate)I.getOperand(1).getPredicate(); @@ -1033,7 +1030,7 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I, break; } - unsigned ResultReg = I.getOperand(0).getReg(); + Register ResultReg = I.getOperand(0).getReg(); RBI.constrainGenericRegister( ResultReg, *getRegClass(LLT::scalar(8), *RBI.getRegBank(ResultReg, MRI, TRI)), MRI); @@ -1043,8 +1040,8 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I, .addReg(LhsReg) .addReg(RhsReg); - unsigned FlagReg1 = MRI.createVirtualRegister(&X86::GR8RegClass); - unsigned FlagReg2 = MRI.createVirtualRegister(&X86::GR8RegClass); + Register FlagReg1 = MRI.createVirtualRegister(&X86::GR8RegClass); + Register FlagReg2 = MRI.createVirtualRegister(&X86::GR8RegClass); MachineInstr &Set1 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::SETCCr), FlagReg1).addImm(SETFOpc[0]); MachineInstr &Set2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), @@ -1089,11 +1086,11 @@ bool X86InstructionSelector::selectUadde(MachineInstr &I, MachineFunction &MF) const { assert((I.getOpcode() == TargetOpcode::G_UADDE) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned CarryOutReg = I.getOperand(1).getReg(); - const unsigned Op0Reg = I.getOperand(2).getReg(); - const unsigned Op1Reg = I.getOperand(3).getReg(); - unsigned CarryInReg = I.getOperand(4).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register CarryOutReg = I.getOperand(1).getReg(); + const Register Op0Reg = I.getOperand(2).getReg(); + const Register Op1Reg = I.getOperand(3).getReg(); + Register CarryInReg = I.getOperand(4).getReg(); const LLT DstTy = 
MRI.getType(DstReg); @@ -1149,8 +1146,8 @@ bool X86InstructionSelector::selectExtract(MachineInstr &I, assert((I.getOpcode() == TargetOpcode::G_EXTRACT) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); int64_t Index = I.getOperand(2).getImm(); const LLT DstTy = MRI.getType(DstReg); @@ -1281,9 +1278,9 @@ bool X86InstructionSelector::selectInsert(MachineInstr &I, MachineFunction &MF) const { assert((I.getOpcode() == TargetOpcode::G_INSERT) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); - const unsigned InsertReg = I.getOperand(2).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); + const Register InsertReg = I.getOperand(2).getReg(); int64_t Index = I.getOperand(3).getImm(); const LLT DstTy = MRI.getType(DstReg); @@ -1335,14 +1332,13 @@ bool X86InstructionSelector::selectInsert(MachineInstr &I, } bool X86InstructionSelector::selectUnmergeValues( - MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF, - CodeGenCoverage &CoverageInfo) const { + MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) { assert((I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES) && "unexpected instruction"); // Split to extracts. unsigned NumDefs = I.getNumOperands() - 1; - unsigned SrcReg = I.getOperand(NumDefs).getReg(); + Register SrcReg = I.getOperand(NumDefs).getReg(); unsigned DefSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits(); for (unsigned Idx = 0; Idx < NumDefs; ++Idx) { @@ -1352,7 +1348,7 @@ bool X86InstructionSelector::selectUnmergeValues( .addReg(SrcReg) .addImm(Idx * DefSize); - if (!select(ExtrInst, CoverageInfo)) + if (!select(ExtrInst)) return false; } @@ -1361,15 +1357,14 @@ bool X86InstructionSelector::selectUnmergeValues( } bool X86InstructionSelector::selectMergeValues( - MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF, - CodeGenCoverage &CoverageInfo) const { + MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) { assert((I.getOpcode() == TargetOpcode::G_MERGE_VALUES || I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS) && "unexpected instruction"); // Split to inserts. - unsigned DstReg = I.getOperand(0).getReg(); - unsigned SrcReg0 = I.getOperand(1).getReg(); + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg0 = I.getOperand(1).getReg(); const LLT DstTy = MRI.getType(DstReg); const LLT SrcTy = MRI.getType(SrcReg0); @@ -1378,13 +1373,13 @@ bool X86InstructionSelector::selectMergeValues( const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); // For the first src use insertSubReg. 
- unsigned DefReg = MRI.createGenericVirtualRegister(DstTy); + Register DefReg = MRI.createGenericVirtualRegister(DstTy); MRI.setRegBank(DefReg, RegBank); if (!emitInsertSubreg(DefReg, I.getOperand(1).getReg(), I, MRI, MF)) return false; for (unsigned Idx = 2; Idx < I.getNumOperands(); ++Idx) { - unsigned Tmp = MRI.createGenericVirtualRegister(DstTy); + Register Tmp = MRI.createGenericVirtualRegister(DstTy); MRI.setRegBank(Tmp, RegBank); MachineInstr &InsertInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(), @@ -1395,7 +1390,7 @@ bool X86InstructionSelector::selectMergeValues( DefReg = Tmp; - if (!select(InsertInst, CoverageInfo)) + if (!select(InsertInst)) return false; } @@ -1403,7 +1398,7 @@ bool X86InstructionSelector::selectMergeValues( TII.get(TargetOpcode::COPY), DstReg) .addReg(DefReg); - if (!select(CopyInst, CoverageInfo)) + if (!select(CopyInst)) return false; I.eraseFromParent(); @@ -1415,7 +1410,7 @@ bool X86InstructionSelector::selectCondBranch(MachineInstr &I, MachineFunction &MF) const { assert((I.getOpcode() == TargetOpcode::G_BRCOND) && "unexpected instruction"); - const unsigned CondReg = I.getOperand(0).getReg(); + const Register CondReg = I.getOperand(0).getReg(); MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); MachineInstr &TestInst = @@ -1442,7 +1437,7 @@ bool X86InstructionSelector::materializeFP(MachineInstr &I, if (CM != CodeModel::Small && CM != CodeModel::Large) return false; - const unsigned DstReg = I.getOperand(0).getReg(); + const Register DstReg = I.getOperand(0).getReg(); const LLT DstTy = MRI.getType(DstReg); const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); unsigned Align = DstTy.getSizeInBits(); @@ -1460,7 +1455,7 @@ bool X86InstructionSelector::materializeFP(MachineInstr &I, // Under X86-64 non-small code model, GV (and friends) are 64-bits, so // they cannot be folded into immediate fields. 
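A note on the change running through all of these X86InstructionSelector.cpp hunks, including the lines on either side of this point: raw `unsigned` register ids are being migrated to the `Register` wrapper, and `TargetRegisterInfo::isPhysicalRegister` calls become `Register::isPhysicalRegister`. A minimal standalone model of what the wrapper encodes, assuming the usual LLVM convention that the high bit of the id marks a virtual register (the real class lives in llvm/include/llvm/CodeGen/Register.h and also reserves encodings for stack slots):

    #include <cassert>
    #include <iostream>

    // Toy model of llvm::Register: one integer namespace for physical and
    // virtual registers, discriminated by the top bit (assumed convention).
    class Register {
      unsigned Reg;
    public:
      Register(unsigned R = 0) : Reg(R) {}
      operator unsigned() const { return Reg; } // interop with old unsigned code
      static bool isVirtualRegister(unsigned R) { return R & 0x80000000u; }
      static bool isPhysicalRegister(unsigned R) {
        return R != 0 && !isVirtualRegister(R); // 0 means "no register"
      }
      bool isVirtual() const { return isVirtualRegister(Reg); }
      bool isPhysical() const { return isPhysicalRegister(Reg); }
    };

    int main() {
      Register Phys(42);              // e.g. a physical register number
      Register Virt(0x80000000u | 7); // virtual register #7
      assert(Phys.isPhysical() && !Phys.isVirtual());
      assert(Virt.isVirtual() && Register::isPhysicalRegister(Phys));
      std::cout << "register model ok\n";
      return 0;
    }

The implicit `unsigned` conversion is what lets a migration like this land mechanically: call sites such as BuildMI and createVirtualRegister keep compiling while declarations flip over one at a time.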
- unsigned AddrReg = MRI.createVirtualRegister(&X86::GR64RegClass); + Register AddrReg = MRI.createVirtualRegister(&X86::GR64RegClass); BuildMI(*I.getParent(), I, DbgLoc, TII.get(X86::MOV64ri), AddrReg) .addConstantPoolIndex(CPI, 0, OpFlag); @@ -1503,7 +1498,7 @@ bool X86InstructionSelector::selectImplicitDefOrPHI( I.getOpcode() == TargetOpcode::G_PHI) && "unexpected instruction"); - unsigned DstReg = I.getOperand(0).getReg(); + Register DstReg = I.getOperand(0).getReg(); if (!MRI.getRegClassOrNull(DstReg)) { const LLT DstTy = MRI.getType(DstReg); @@ -1537,7 +1532,7 @@ bool X86InstructionSelector::selectShift(MachineInstr &I, I.getOpcode() == TargetOpcode::G_LSHR) && "unexpected instruction"); - unsigned DstReg = I.getOperand(0).getReg(); + Register DstReg = I.getOperand(0).getReg(); const LLT DstTy = MRI.getType(DstReg); const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); @@ -1578,8 +1573,8 @@ bool X86InstructionSelector::selectShift(MachineInstr &I, return false; } - unsigned Op0Reg = I.getOperand(1).getReg(); - unsigned Op1Reg = I.getOperand(2).getReg(); + Register Op0Reg = I.getOperand(1).getReg(); + Register Op1Reg = I.getOperand(2).getReg(); assert(MRI.getType(Op1Reg).getSizeInBits() == 8); @@ -1606,9 +1601,9 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I, I.getOpcode() == TargetOpcode::G_UREM) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned Op1Reg = I.getOperand(1).getReg(); - const unsigned Op2Reg = I.getOperand(2).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register Op1Reg = I.getOperand(1).getReg(); + const Register Op2Reg = I.getOperand(2).getReg(); const LLT RegTy = MRI.getType(DstReg); assert(RegTy == MRI.getType(Op1Reg) && RegTy == MRI.getType(Op2Reg) && @@ -1732,7 +1727,7 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I, BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpEntry.OpSignExtend)); else { - unsigned Zero32 = MRI.createVirtualRegister(&X86::GR32RegClass); + Register Zero32 = MRI.createVirtualRegister(&X86::GR32RegClass); BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::MOV32r0), Zero32); @@ -1770,8 +1765,8 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I, if ((I.getOpcode() == Instruction::SRem || I.getOpcode() == Instruction::URem) && OpEntry.DivRemResultReg == X86::AH && STI.is64Bit()) { - unsigned SourceSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass); - unsigned ResultSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass); + Register SourceSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass); + Register ResultSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass); BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Copy), SourceSuperReg) .addReg(X86::AX); diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 40141d894629..1d7adbaa9e99 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -23,7 +23,7 @@ enum IntrinsicType : uint16_t { GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS, INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP, INTR_TYPE_3OP_IMM8, - CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV, + CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV, BEXTRI, CVTPD2PS_MASK, INTR_TYPE_1OP_SAE, INTR_TYPE_2OP_SAE, INTR_TYPE_1OP_MASK_SAE, INTR_TYPE_2OP_MASK_SAE, INTR_TYPE_3OP_MASK_SAE, @@ -1101,8 +1101,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { 
X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(subborrow_32, ADX, X86ISD::SBB, X86ISD::SUB), X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB), - X86_INTRINSIC_DATA(tbm_bextri_u32, INTR_TYPE_2OP, X86ISD::BEXTR, 0), - X86_INTRINSIC_DATA(tbm_bextri_u64, INTR_TYPE_2OP, X86ISD::BEXTR, 0), + X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTR, 0), + X86_INTRINSIC_DATA(tbm_bextri_u64, BEXTRI, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(vcvtph2ps_128, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0), X86_INTRINSIC_DATA(vcvtph2ps_256, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0), X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0), diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp index 00fb1b573858..04121f863c89 100644 --- a/lib/Target/X86/X86LegalizerInfo.cpp +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -13,6 +13,7 @@ #include "X86LegalizerInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" @@ -84,6 +85,24 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, verify(*STI.getInstrInfo()); } +bool X86LegalizerInfo::legalizeIntrinsic(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + switch (MI.getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memset: + case Intrinsic::memmove: + if (createMemLibcall(MIRBuilder, MRI, MI) == + LegalizerHelper::UnableToLegalize) + return false; + MI.eraseFromParent(); + return true; + default: + break; + } + return true; +} + void X86LegalizerInfo::setLegalizerInfo32bit() { const LLT p0 = LLT::pointer(0, TM.getPointerSizeInBits(0)); @@ -158,6 +177,7 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { setAction({G_ANYEXT, Ty}, Legal); } setAction({G_ANYEXT, s128}, Legal); + getActionDefinitionsBuilder(G_SEXT_INREG).lower(); // Comparison setAction({G_ICMP, s1}, Legal); diff --git a/lib/Target/X86/X86LegalizerInfo.h b/lib/Target/X86/X86LegalizerInfo.h index d21707b9ab9b..7a0f13fb5ae6 100644 --- a/lib/Target/X86/X86LegalizerInfo.h +++ b/lib/Target/X86/X86LegalizerInfo.h @@ -32,6 +32,9 @@ private: public: X86LegalizerInfo(const X86Subtarget &STI, const X86TargetMachine &TM); + bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const override; + private: void setLegalizerInfo32bit(); void setLegalizerInfo64bit(); diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index b1fefaa84be4..78098fd6262f 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -427,6 +427,41 @@ X86MCInstLower::LowerMachineOperand(const MachineInstr *MI, } } +// Replace TAILJMP opcodes with their equivalent opcodes that have encoding +// information. 
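The helper defined next replaces two ad-hoc lowering blocks (previously one for TAILJMPr64/CALL64* operand truncation and one goto-based block for TAILJMPd/TAILJMPd64), giving the MCInst lowering cases and the XRay PATCHABLE_TAIL_CALL path further down a single mapping to share. A standalone toy mirror of the pattern, with hypothetical enum values standing in for the real generated X86 opcode numbers:

    #include <cassert>

    // Hypothetical opcodes; the real ones are tablegen-generated X86 enums.
    enum Opcode { TAILJMPd, TAILJMPd64, TAILJMPd_CC, TAILJMPd64_CC,
                  JMP_1, JCC_1, SOMETHING_ELSE };

    // Map pseudo tail-jump opcodes to encodable jump opcodes; anything else
    // passes through unchanged, just like the switch below.
    unsigned convertTailJump(unsigned Op) {
      switch (Op) {
      case TAILJMPd:    case TAILJMPd64:    return JMP_1;
      case TAILJMPd_CC: case TAILJMPd64_CC: return JCC_1;
      default:                              return Op;
      }
    }

    int main() {
      assert(convertTailJump(TAILJMPd64) == JMP_1);
      assert(convertTailJump(SOMETHING_ELSE) == SOMETHING_ELSE);
      return 0;
    }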
+static unsigned convertTailJumpOpcode(unsigned Opcode) { + switch (Opcode) { + case X86::TAILJMPr: + Opcode = X86::JMP32r; + break; + case X86::TAILJMPm: + Opcode = X86::JMP32m; + break; + case X86::TAILJMPr64: + Opcode = X86::JMP64r; + break; + case X86::TAILJMPm64: + Opcode = X86::JMP64m; + break; + case X86::TAILJMPr64_REX: + Opcode = X86::JMP64r_REX; + break; + case X86::TAILJMPm64_REX: + Opcode = X86::JMP64m_REX; + break; + case X86::TAILJMPd: + case X86::TAILJMPd64: + Opcode = X86::JMP_1; + break; + case X86::TAILJMPd_CC: + case X86::TAILJMPd64_CC: + Opcode = X86::JCC_1; + break; + } + + return Opcode; +} + void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); @@ -500,21 +535,190 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { break; } - // TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have register - // inputs modeled as normal uses instead of implicit uses. As such, truncate - // off all but the first operand (the callee). FIXME: Change isel. - case X86::TAILJMPr64: - case X86::TAILJMPr64_REX: - case X86::CALL64r: - case X86::CALL64pcrel32: { - unsigned Opcode = OutMI.getOpcode(); - MCOperand Saved = OutMI.getOperand(0); - OutMI = MCInst(); - OutMI.setOpcode(Opcode); - OutMI.addOperand(Saved); + case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rmik: + case X86::VPCMPBZ128rri: case X86::VPCMPBZ128rrik: + case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rmik: + case X86::VPCMPBZ256rri: case X86::VPCMPBZ256rrik: + case X86::VPCMPBZrmi: case X86::VPCMPBZrmik: + case X86::VPCMPBZrri: case X86::VPCMPBZrrik: + case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rmik: + case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk: + case X86::VPCMPDZ128rri: case X86::VPCMPDZ128rrik: + case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rmik: + case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk: + case X86::VPCMPDZ256rri: case X86::VPCMPDZ256rrik: + case X86::VPCMPDZrmi: case X86::VPCMPDZrmik: + case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk: + case X86::VPCMPDZrri: case X86::VPCMPDZrrik: + case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rmik: + case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk: + case X86::VPCMPQZ128rri: case X86::VPCMPQZ128rrik: + case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rmik: + case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk: + case X86::VPCMPQZ256rri: case X86::VPCMPQZ256rrik: + case X86::VPCMPQZrmi: case X86::VPCMPQZrmik: + case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk: + case X86::VPCMPQZrri: case X86::VPCMPQZrrik: + case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rmik: + case X86::VPCMPWZ128rri: case X86::VPCMPWZ128rrik: + case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rmik: + case X86::VPCMPWZ256rri: case X86::VPCMPWZ256rrik: + case X86::VPCMPWZrmi: case X86::VPCMPWZrmik: + case X86::VPCMPWZrri: case X86::VPCMPWZrrik: { + // Turn immediate 0 into the VPCMPEQ instruction. 
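For context on why immediates 0 and 6 are the two values special-cased in the block beginning here and in its twin below: the last operand of the AVX-512 VPCMP{B,W,D,Q} forms is a 3-bit comparison predicate encoded (per the Intel SDM; this table is background, not part of the diff) as 0=EQ, 1=LT, 2=LE, 3=FALSE, 4=NEQ, 5=NLT, 6=NLE, 7=TRUE. EQ and NLE (signed greater-than) are exactly the two predicates that have dedicated VPCMPEQ*/VPCMPGT* opcodes, which drop the immediate byte and so encode one byte shorter. A small standalone check of that table:

    #include <cassert>
    #include <string>

    // imm8[2:0] of VPCMP{B,W,D,Q} -> predicate mnemonic suffix.
    std::string predicateName(unsigned Imm) {
      static const char *Names[8] = {"eq",  "lt",  "le",  "false",
                                     "neq", "nlt", "nle", "true"};
      assert(Imm < 8 && "VPCMP takes a 3-bit predicate");
      return Names[Imm];
    }

    int main() {
      assert(predicateName(0) == "eq");  // vpcmpd $0, ... -> vpcmpeqd ...
      assert(predicateName(6) == "nle"); // not-less-or-equal == signed gt,
                                         // so vpcmpd $6, ... -> vpcmpgtd ...
      return 0;
    }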
+ if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 0) { + unsigned NewOpc; + switch (OutMI.getOpcode()) { + case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPEQBZ128rm; break; + case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPEQBZ128rmk; break; + case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPEQBZ128rr; break; + case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPEQBZ128rrk; break; + case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPEQBZ256rm; break; + case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPEQBZ256rmk; break; + case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPEQBZ256rr; break; + case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPEQBZ256rrk; break; + case X86::VPCMPBZrmi: NewOpc = X86::VPCMPEQBZrm; break; + case X86::VPCMPBZrmik: NewOpc = X86::VPCMPEQBZrmk; break; + case X86::VPCMPBZrri: NewOpc = X86::VPCMPEQBZrr; break; + case X86::VPCMPBZrrik: NewOpc = X86::VPCMPEQBZrrk; break; + case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPEQDZ128rm; break; + case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPEQDZ128rmb; break; + case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPEQDZ128rmbk; break; + case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPEQDZ128rmk; break; + case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPEQDZ128rr; break; + case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPEQDZ128rrk; break; + case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPEQDZ256rm; break; + case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPEQDZ256rmb; break; + case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPEQDZ256rmbk; break; + case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPEQDZ256rmk; break; + case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPEQDZ256rr; break; + case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPEQDZ256rrk; break; + case X86::VPCMPDZrmi: NewOpc = X86::VPCMPEQDZrm; break; + case X86::VPCMPDZrmib: NewOpc = X86::VPCMPEQDZrmb; break; + case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPEQDZrmbk; break; + case X86::VPCMPDZrmik: NewOpc = X86::VPCMPEQDZrmk; break; + case X86::VPCMPDZrri: NewOpc = X86::VPCMPEQDZrr; break; + case X86::VPCMPDZrrik: NewOpc = X86::VPCMPEQDZrrk; break; + case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPEQQZ128rm; break; + case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPEQQZ128rmb; break; + case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPEQQZ128rmbk; break; + case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPEQQZ128rmk; break; + case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPEQQZ128rr; break; + case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPEQQZ128rrk; break; + case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPEQQZ256rm; break; + case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPEQQZ256rmb; break; + case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPEQQZ256rmbk; break; + case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPEQQZ256rmk; break; + case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPEQQZ256rr; break; + case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPEQQZ256rrk; break; + case X86::VPCMPQZrmi: NewOpc = X86::VPCMPEQQZrm; break; + case X86::VPCMPQZrmib: NewOpc = X86::VPCMPEQQZrmb; break; + case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPEQQZrmbk; break; + case X86::VPCMPQZrmik: NewOpc = X86::VPCMPEQQZrmk; break; + case X86::VPCMPQZrri: NewOpc = X86::VPCMPEQQZrr; break; + case X86::VPCMPQZrrik: NewOpc = X86::VPCMPEQQZrrk; break; + case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPEQWZ128rm; break; + case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPEQWZ128rmk; break; + case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPEQWZ128rr; break; + case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPEQWZ128rrk; break; + case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPEQWZ256rm; break; + case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPEQWZ256rmk; 
break; + case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPEQWZ256rr; break; + case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPEQWZ256rrk; break; + case X86::VPCMPWZrmi: NewOpc = X86::VPCMPEQWZrm; break; + case X86::VPCMPWZrmik: NewOpc = X86::VPCMPEQWZrmk; break; + case X86::VPCMPWZrri: NewOpc = X86::VPCMPEQWZrr; break; + case X86::VPCMPWZrrik: NewOpc = X86::VPCMPEQWZrrk; break; + } + + OutMI.setOpcode(NewOpc); + OutMI.erase(&OutMI.getOperand(OutMI.getNumOperands() - 1)); + break; + } + + // Turn immediate 6 into the VPCMPGT instruction. + if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 6) { + unsigned NewOpc; + switch (OutMI.getOpcode()) { + case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPGTBZ128rm; break; + case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPGTBZ128rmk; break; + case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPGTBZ128rr; break; + case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPGTBZ128rrk; break; + case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPGTBZ256rm; break; + case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPGTBZ256rmk; break; + case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPGTBZ256rr; break; + case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPGTBZ256rrk; break; + case X86::VPCMPBZrmi: NewOpc = X86::VPCMPGTBZrm; break; + case X86::VPCMPBZrmik: NewOpc = X86::VPCMPGTBZrmk; break; + case X86::VPCMPBZrri: NewOpc = X86::VPCMPGTBZrr; break; + case X86::VPCMPBZrrik: NewOpc = X86::VPCMPGTBZrrk; break; + case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPGTDZ128rm; break; + case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPGTDZ128rmb; break; + case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPGTDZ128rmbk; break; + case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPGTDZ128rmk; break; + case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPGTDZ128rr; break; + case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPGTDZ128rrk; break; + case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPGTDZ256rm; break; + case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPGTDZ256rmb; break; + case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPGTDZ256rmbk; break; + case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPGTDZ256rmk; break; + case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPGTDZ256rr; break; + case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPGTDZ256rrk; break; + case X86::VPCMPDZrmi: NewOpc = X86::VPCMPGTDZrm; break; + case X86::VPCMPDZrmib: NewOpc = X86::VPCMPGTDZrmb; break; + case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPGTDZrmbk; break; + case X86::VPCMPDZrmik: NewOpc = X86::VPCMPGTDZrmk; break; + case X86::VPCMPDZrri: NewOpc = X86::VPCMPGTDZrr; break; + case X86::VPCMPDZrrik: NewOpc = X86::VPCMPGTDZrrk; break; + case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPGTQZ128rm; break; + case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPGTQZ128rmb; break; + case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPGTQZ128rmbk; break; + case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPGTQZ128rmk; break; + case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPGTQZ128rr; break; + case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPGTQZ128rrk; break; + case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPGTQZ256rm; break; + case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPGTQZ256rmb; break; + case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPGTQZ256rmbk; break; + case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPGTQZ256rmk; break; + case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPGTQZ256rr; break; + case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPGTQZ256rrk; break; + case X86::VPCMPQZrmi: NewOpc = X86::VPCMPGTQZrm; break; + case X86::VPCMPQZrmib: NewOpc = X86::VPCMPGTQZrmb; break; + case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPGTQZrmbk; break; + case X86::VPCMPQZrmik: NewOpc 
= X86::VPCMPGTQZrmk; break; + case X86::VPCMPQZrri: NewOpc = X86::VPCMPGTQZrr; break; + case X86::VPCMPQZrrik: NewOpc = X86::VPCMPGTQZrrk; break; + case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPGTWZ128rm; break; + case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPGTWZ128rmk; break; + case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPGTWZ128rr; break; + case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPGTWZ128rrk; break; + case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPGTWZ256rm; break; + case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPGTWZ256rmk; break; + case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPGTWZ256rr; break; + case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPGTWZ256rrk; break; + case X86::VPCMPWZrmi: NewOpc = X86::VPCMPGTWZrm; break; + case X86::VPCMPWZrmik: NewOpc = X86::VPCMPGTWZrmk; break; + case X86::VPCMPWZrri: NewOpc = X86::VPCMPGTWZrr; break; + case X86::VPCMPWZrrik: NewOpc = X86::VPCMPGTWZrrk; break; + } + + OutMI.setOpcode(NewOpc); + OutMI.erase(&OutMI.getOperand(OutMI.getNumOperands() - 1)); + break; + } + break; } + // CALL64r, CALL64pcrel32 - These instructions used to have + // register inputs modeled as normal uses instead of implicit uses. As such, + // they we used to truncate off all but the first operand (the callee). This + // issue seems to have been fixed at some point. This assert verifies that. + case X86::CALL64r: + case X86::CALL64pcrel32: + assert(OutMI.getNumOperands() == 1 && "Unexpected number of operands!"); + break; + case X86::EH_RETURN: case X86::EH_RETURN64: { OutMI = MCInst(); @@ -539,36 +743,30 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { break; } - // TAILJMPd, TAILJMPd64, TailJMPd_cc - Lower to the correct jump - // instruction. - { - unsigned Opcode; - case X86::TAILJMPr: - Opcode = X86::JMP32r; - goto SetTailJmpOpcode; - case X86::TAILJMPd: - case X86::TAILJMPd64: - Opcode = X86::JMP_1; - goto SetTailJmpOpcode; - - SetTailJmpOpcode: - MCOperand Saved = OutMI.getOperand(0); - OutMI = MCInst(); - OutMI.setOpcode(Opcode); - OutMI.addOperand(Saved); - break; - } + // TAILJMPd, TAILJMPd64, TailJMPd_cc - Lower to the correct jump + // instruction. 
+ case X86::TAILJMPr: + case X86::TAILJMPr64: + case X86::TAILJMPr64_REX: + case X86::TAILJMPd: + case X86::TAILJMPd64: + assert(OutMI.getNumOperands() == 1 && "Unexpected number of operands!"); + OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode())); + break; case X86::TAILJMPd_CC: - case X86::TAILJMPd64_CC: { - MCOperand Saved = OutMI.getOperand(0); - MCOperand Saved2 = OutMI.getOperand(1); - OutMI = MCInst(); - OutMI.setOpcode(X86::JCC_1); - OutMI.addOperand(Saved); - OutMI.addOperand(Saved2); + case X86::TAILJMPd64_CC: + assert(OutMI.getNumOperands() == 2 && "Unexpected number of operands!"); + OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode())); + break; + + case X86::TAILJMPm: + case X86::TAILJMPm64: + case X86::TAILJMPm64_REX: + assert(OutMI.getNumOperands() == X86::AddrNumOperands && + "Unexpected number of operands!"); + OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode())); break; - } case X86::DEC16r: case X86::DEC32r: @@ -958,7 +1156,7 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI, // FAULTING_LOAD_OP <def>, <faltinf type>, <MBB handler>, // <opcode>, <operands> - unsigned DefRegister = FaultingMI.getOperand(0).getReg(); + Register DefRegister = FaultingMI.getOperand(0).getReg(); FaultMaps::FaultKind FK = static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm()); MCSymbol *HandlerLabel = FaultingMI.getOperand(2).getMBB()->getSymbol(); @@ -1079,7 +1277,7 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI, // Emit MOV to materialize the target address and the CALL to target. // This is encoded with 12-13 bytes, depending on which register is used. - unsigned ScratchReg = MI.getOperand(ScratchIdx).getReg(); + Register ScratchReg = MI.getOperand(ScratchIdx).getReg(); if (X86II::isX86_64ExtendedReg(ScratchReg)) EncodedBytes = 13; else @@ -1369,6 +1567,7 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, recordSled(CurSled, MI, SledKind::TAIL_CALL); unsigned OpCode = MI.getOperand(0).getImm(); + OpCode = convertTailJumpOpcode(OpCode); MCInst TC; TC.setOpcode(OpCode); @@ -1538,8 +1737,6 @@ static void printConstant(const Constant *COp, raw_ostream &CS) { void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) { assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); assert(getSubtarget().isOSWindows() && "SEH_ instruction Windows only"); - const X86RegisterInfo *RI = - MF->getSubtarget<X86Subtarget>().getRegisterInfo(); // Use the .cv_fpo directives if we're emitting CodeView on 32-bit x86. if (EmitFPOData) { @@ -1577,17 +1774,16 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) { // Otherwise, use the .seh_ directives for all other Windows platforms. 
switch (MI->getOpcode()) { case X86::SEH_PushReg: - OutStreamer->EmitWinCFIPushReg( - RI->getSEHRegNum(MI->getOperand(0).getImm())); + OutStreamer->EmitWinCFIPushReg(MI->getOperand(0).getImm()); break; case X86::SEH_SaveReg: - OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()), + OutStreamer->EmitWinCFISaveReg(MI->getOperand(0).getImm(), MI->getOperand(1).getImm()); break; case X86::SEH_SaveXMM: - OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()), + OutStreamer->EmitWinCFISaveXMM(MI->getOperand(0).getImm(), MI->getOperand(1).getImm()); break; @@ -1596,9 +1792,8 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) { break; case X86::SEH_SetFrame: - OutStreamer->EmitWinCFISetFrame( - RI->getSEHRegNum(MI->getOperand(0).getImm()), - MI->getOperand(1).getImm()); + OutStreamer->EmitWinCFISetFrame(MI->getOperand(0).getImm(), + MI->getOperand(1).getImm()); break; case X86::SEH_PushFrame: @@ -1650,7 +1845,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::EH_RETURN: case X86::EH_RETURN64: { // Lower these as normal, but add some comments. - unsigned Reg = MI->getOperand(0).getReg(); + Register Reg = MI->getOperand(0).getReg(); OutStreamer->AddComment(StringRef("eh_return, addr: %") + X86ATTInstPrinter::getRegisterName(Reg)); break; @@ -1697,11 +1892,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::MASKPAIR16LOAD: { int64_t Disp = MI->getOperand(1 + X86::AddrDisp).getImm(); assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement"); - const X86RegisterInfo *RI = - MF->getSubtarget<X86Subtarget>().getRegisterInfo(); - unsigned Reg = MI->getOperand(0).getReg(); - unsigned Reg0 = RI->getSubReg(Reg, X86::sub_mask_0); - unsigned Reg1 = RI->getSubReg(Reg, X86::sub_mask_1); + Register Reg = MI->getOperand(0).getReg(); + Register Reg0 = RI->getSubReg(Reg, X86::sub_mask_0); + Register Reg1 = RI->getSubReg(Reg, X86::sub_mask_1); // Load the first mask register MCInstBuilder MIB = MCInstBuilder(X86::KMOVWkm); @@ -1730,11 +1923,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::MASKPAIR16STORE: { int64_t Disp = MI->getOperand(X86::AddrDisp).getImm(); assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement"); - const X86RegisterInfo *RI = - MF->getSubtarget<X86Subtarget>().getRegisterInfo(); - unsigned Reg = MI->getOperand(X86::AddrNumOperands).getReg(); - unsigned Reg0 = RI->getSubReg(Reg, X86::sub_mask_0); - unsigned Reg1 = RI->getSubReg(Reg, X86::sub_mask_1); + Register Reg = MI->getOperand(X86::AddrNumOperands).getReg(); + Register Reg0 = RI->getSubReg(Reg, X86::sub_mask_0); + Register Reg1 = RI->getSubReg(Reg, X86::sub_mask_1); // Store the first mask register MCInstBuilder MIB = MCInstBuilder(X86::KMOVWmk); diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h index d7e535598d81..5cb80a082b56 100644 --- a/lib/Target/X86/X86MachineFunctionInfo.h +++ b/lib/Target/X86/X86MachineFunctionInfo.h @@ -36,6 +36,10 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// is stashed. signed char RestoreBasePointerOffset = 0; + /// WinEHXMMSlotInfo - Slot information of XMM registers in the stack frame + /// in bytes. + DenseMap<int, unsigned> WinEHXMMSlotInfo; + /// CalleeSavedFrameSize - Size of the callee-saved register portion of the /// stack frame in bytes. 
unsigned CalleeSavedFrameSize = 0; @@ -120,6 +124,10 @@ public: void setRestoreBasePointer(const MachineFunction *MF); int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; } + DenseMap<int, unsigned>& getWinEHXMMSlotInfo() { return WinEHXMMSlotInfo; } + const DenseMap<int, unsigned>& getWinEHXMMSlotInfo() const { + return WinEHXMMSlotInfo; } + unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; } void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; } diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp index 7f75598b0655..1aee01563c4b 100644 --- a/lib/Target/X86/X86OptimizeLEAs.cpp +++ b/lib/Target/X86/X86OptimizeLEAs.cpp @@ -198,8 +198,7 @@ static inline MemOpKey getMemOpKey(const MachineInstr &MI, unsigned N) { static inline bool isIdenticalOp(const MachineOperand &MO1, const MachineOperand &MO2) { return MO1.isIdenticalTo(MO2) && - (!MO1.isReg() || - !TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); + (!MO1.isReg() || !Register::isPhysicalRegister(MO1.getReg())); } #ifndef NDEBUG @@ -235,9 +234,9 @@ static inline bool isLEA(const MachineInstr &MI) { namespace { -class OptimizeLEAPass : public MachineFunctionPass { +class X86OptimizeLEAPass : public MachineFunctionPass { public: - OptimizeLEAPass() : MachineFunctionPass(ID) {} + X86OptimizeLEAPass() : MachineFunctionPass(ID) {} StringRef getPassName() const override { return "X86 LEA Optimize"; } @@ -246,6 +245,8 @@ public: /// been calculated by LEA. Also, remove redundant LEAs. bool runOnMachineFunction(MachineFunction &MF) override; + static char ID; + private: using MemOpMap = DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>>; @@ -296,18 +297,18 @@ private: MachineRegisterInfo *MRI; const X86InstrInfo *TII; const X86RegisterInfo *TRI; - - static char ID; }; } // end anonymous namespace -char OptimizeLEAPass::ID = 0; +char X86OptimizeLEAPass::ID = 0; -FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); } +FunctionPass *llvm::createX86OptimizeLEAs() { return new X86OptimizeLEAPass(); } +INITIALIZE_PASS(X86OptimizeLEAPass, DEBUG_TYPE, "X86 optimize LEA pass", false, + false) -int OptimizeLEAPass::calcInstrDist(const MachineInstr &First, - const MachineInstr &Last) { +int X86OptimizeLEAPass::calcInstrDist(const MachineInstr &First, + const MachineInstr &Last) { // Both instructions must be in the same basic block and they must be // presented in InstrPos. assert(Last.getParent() == First.getParent() && @@ -328,10 +329,9 @@ int OptimizeLEAPass::calcInstrDist(const MachineInstr &First, // 3) Displacement of the new memory operand should fit in 1 byte if possible. // 4) The LEA should be as close to MI as possible, and prior to it if // possible. -bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List, - const MachineInstr &MI, - MachineInstr *&BestLEA, - int64_t &AddrDispShift, int &Dist) { +bool X86OptimizeLEAPass::chooseBestLEA( + const SmallVectorImpl<MachineInstr *> &List, const MachineInstr &MI, + MachineInstr *&BestLEA, int64_t &AddrDispShift, int &Dist) { const MachineFunction *MF = MI.getParent()->getParent(); const MCInstrDesc &Desc = MI.getDesc(); int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags) + @@ -387,9 +387,10 @@ bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List, // Get the difference between the addresses' displacements of the two // instructions \p MI1 and \p MI2. 
The numbers of the first memory operands are // passed through \p N1 and \p N2. -int64_t OptimizeLEAPass::getAddrDispShift(const MachineInstr &MI1, unsigned N1, - const MachineInstr &MI2, - unsigned N2) const { +int64_t X86OptimizeLEAPass::getAddrDispShift(const MachineInstr &MI1, + unsigned N1, + const MachineInstr &MI2, + unsigned N2) const { const MachineOperand &Op1 = MI1.getOperand(N1 + X86::AddrDisp); const MachineOperand &Op2 = MI2.getOperand(N2 + X86::AddrDisp); @@ -411,9 +412,9 @@ int64_t OptimizeLEAPass::getAddrDispShift(const MachineInstr &MI1, unsigned N1, // 2) Def registers of LEAs belong to the same class. // 3) All uses of the Last LEA def register are replaceable, thus the // register is used only as address base. -bool OptimizeLEAPass::isReplaceable(const MachineInstr &First, - const MachineInstr &Last, - int64_t &AddrDispShift) const { +bool X86OptimizeLEAPass::isReplaceable(const MachineInstr &First, + const MachineInstr &Last, + int64_t &AddrDispShift) const { assert(isLEA(First) && isLEA(Last) && "The function works only with LEA instructions"); @@ -467,7 +468,8 @@ bool OptimizeLEAPass::isReplaceable(const MachineInstr &First, return true; } -void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs) { +void X86OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, + MemOpMap &LEAs) { unsigned Pos = 0; for (auto &MI : MBB) { // Assign the position number to the instruction. Note that we are going to @@ -485,7 +487,7 @@ void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs) { // Try to find load and store instructions which recalculate addresses already // calculated by some LEA and replace their memory operands with its def // register. -bool OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) { +bool X86OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) { bool Changed = false; assert(!LEAs.empty()); @@ -564,9 +566,9 @@ bool OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) { return Changed; } -MachineInstr *OptimizeLEAPass::replaceDebugValue(MachineInstr &MI, - unsigned VReg, - int64_t AddrDispShift) { +MachineInstr *X86OptimizeLEAPass::replaceDebugValue(MachineInstr &MI, + unsigned VReg, + int64_t AddrDispShift) { DIExpression *Expr = const_cast<DIExpression *>(MI.getDebugExpression()); if (AddrDispShift != 0) Expr = DIExpression::prepend(Expr, DIExpression::StackValue, AddrDispShift); @@ -583,7 +585,7 @@ MachineInstr *OptimizeLEAPass::replaceDebugValue(MachineInstr &MI, } // Try to find similar LEAs in the list and replace one with another. -bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) { +bool X86OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) { bool Changed = false; // Loop over all entries in the table. @@ -613,8 +615,8 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) { // Loop over all uses of the Last LEA and update their operands. Note // that the correctness of this has already been checked in the // isReplaceable function. 
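A structural note on the X86OptimizeLEAs.cpp hunks in this stretch: beyond renaming OptimizeLEAPass to X86OptimizeLEAPass, the pass's `static char ID` moves from the private section to public, which is what the new INITIALIZE_PASS registration requires, since the macro takes the address of Class::ID. The resulting shape, condensed from the hunks themselves (a fragment that needs the LLVM pass headers, not a standalone program):

    // In the pass's .cpp file, inside an anonymous namespace:
    class X86OptimizeLEAPass : public MachineFunctionPass {
    public:
      X86OptimizeLEAPass() : MachineFunctionPass(ID) {}
      StringRef getPassName() const override { return "X86 LEA Optimize"; }
      bool runOnMachineFunction(MachineFunction &MF) override;
      static char ID; // public so INITIALIZE_PASS can take &X86OptimizeLEAPass::ID
    };

    char X86OptimizeLEAPass::ID = 0;
    // DEBUG_TYPE doubles as the pass's command-line argument string.
    INITIALIZE_PASS(X86OptimizeLEAPass, DEBUG_TYPE, "X86 optimize LEA pass",
                    false, false)

Registering the pass this way lets tooling refer to it by name (for example the pass registry and -print-before/-print-after style options) instead of it existing only behind createX86OptimizeLEAs().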
- unsigned FirstVReg = First.getOperand(0).getReg(); - unsigned LastVReg = Last.getOperand(0).getReg(); + Register FirstVReg = First.getOperand(0).getReg(); + Register LastVReg = Last.getOperand(0).getReg(); for (auto UI = MRI->use_begin(LastVReg), UE = MRI->use_end(); UI != UE;) { MachineOperand &MO = *UI++; @@ -670,7 +672,7 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) { return Changed; } -bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { +bool X86OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; if (DisableX86LEAOpt || skipFunction(MF.getFunction())) diff --git a/lib/Target/X86/X86RegisterBankInfo.cpp b/lib/Target/X86/X86RegisterBankInfo.cpp index 78fede3dcde2..daddf4231897 100644 --- a/lib/Target/X86/X86RegisterBankInfo.cpp +++ b/lib/Target/X86/X86RegisterBankInfo.cpp @@ -46,7 +46,9 @@ const RegisterBank &X86RegisterBankInfo::getRegBankFromRegClass( if (X86::GR8RegClass.hasSubClassEq(&RC) || X86::GR16RegClass.hasSubClassEq(&RC) || X86::GR32RegClass.hasSubClassEq(&RC) || - X86::GR64RegClass.hasSubClassEq(&RC)) + X86::GR64RegClass.hasSubClassEq(&RC) || + X86::LOW32_ADDR_ACCESSRegClass.hasSubClassEq(&RC) || + X86::LOW32_ADDR_ACCESS_RBPRegClass.hasSubClassEq(&RC)) return getRegBank(X86::GPRRegBankID); if (X86::FR32XRegClass.hasSubClassEq(&RC) || diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 2e2f1f9e438a..ff625325b4c9 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -544,7 +544,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { "Stack realignment in presence of dynamic allocas is not supported with" "this calling convention."); - unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), 64); + Register BasePtr = getX86SubSuperRegister(getBaseRegister(), 64); for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true); I.isValid(); ++I) Reserved.set(*I); @@ -677,13 +677,13 @@ static bool tryOptimizeLEAtoMOV(MachineBasicBlock::iterator II) { MI.getOperand(4).getImm() != 0 || MI.getOperand(5).getReg() != X86::NoRegister) return false; - unsigned BasePtr = MI.getOperand(1).getReg(); + Register BasePtr = MI.getOperand(1).getReg(); // In X32 mode, ensure the base-pointer is a 32-bit operand, so the LEA will // be replaced with a 32-bit operand MOV which will zero extend the upper // 32-bits of the super register. if (Opc == X86::LEA64_32r) BasePtr = getX86SubSuperRegister(BasePtr, 32); - unsigned NewDestReg = MI.getOperand(0).getReg(); + Register NewDestReg = MI.getOperand(0).getReg(); const X86InstrInfo *TII = MI.getParent()->getParent()->getSubtarget<X86Subtarget>().getInstrInfo(); TII->copyPhysReg(*MI.getParent(), II, MI.getDebugLoc(), NewDestReg, BasePtr, @@ -692,12 +692,27 @@ static bool tryOptimizeLEAtoMOV(MachineBasicBlock::iterator II) { return true; } +static bool isFuncletReturnInstr(MachineInstr &MI) { + switch (MI.getOpcode()) { + case X86::CATCHRET: + case X86::CLEANUPRET: + return true; + default: + return false; + } + llvm_unreachable("impossible"); +} + void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { MachineInstr &MI = *II; - MachineFunction &MF = *MI.getParent()->getParent(); + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + bool IsEHFuncletEpilogue = MBBI == MBB.end() ? 
false + : isFuncletReturnInstr(*MBBI); const X86FrameLowering *TFI = getFrameLowering(MF); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); @@ -709,6 +724,8 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) && "Return instruction can only reference SP relative frame objects"); FIOffset = TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0); + } else if (TFI->Is64Bit && (MBB.isEHFuncletEntry() || IsEHFuncletEpilogue)) { + FIOffset = TFI->getWin64EHFrameIndexRef(MF, FrameIndex, BasePtr); } else { FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, BasePtr); } @@ -729,7 +746,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // register as source operand, semantic is the same and destination is // 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided. // Don't change BasePtr since it is used later for stack adjustment. - unsigned MachineBasePtr = BasePtr; + Register MachineBasePtr = BasePtr; if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr)) MachineBasePtr = getX86SubSuperRegister(BasePtr, 64); @@ -773,7 +790,7 @@ Register X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { unsigned X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const { const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); - unsigned FrameReg = getFrameRegister(MF); + Register FrameReg = getFrameRegister(MF); if (Subtarget.isTarget64BitILP32()) FrameReg = getX86SubSuperRegister(FrameReg, 32); return FrameReg; @@ -782,7 +799,7 @@ X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const { unsigned X86RegisterInfo::getPtrSizedStackRegister(const MachineFunction &MF) const { const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); - unsigned StackReg = getStackRegister(); + Register StackReg = getStackRegister(); if (Subtarget.isTarget64BitILP32()) StackReg = getX86SubSuperRegister(StackReg, 32); return StackReg; diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp index b435b22e8ac7..f8464c7e8298 100644 --- a/lib/Target/X86/X86RetpolineThunks.cpp +++ b/lib/Target/X86/X86RetpolineThunks.cpp @@ -58,8 +58,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired<MachineModuleInfo>(); - AU.addPreserved<MachineModuleInfo>(); + AU.addRequired<MachineModuleInfoWrapperPass>(); + AU.addPreserved<MachineModuleInfoWrapperPass>(); } private: @@ -97,7 +97,7 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { TII = STI->getInstrInfo(); Is64Bit = TM->getTargetTriple().getArch() == Triple::x86_64; - MMI = &getAnalysis<MachineModuleInfo>(); + MMI = &getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); Module &M = const_cast<Module &>(*MMI->getModule()); // If this function is not a thunk, check to see if we need to insert @@ -279,7 +279,7 @@ void X86RetpolineThunks::populateThunk(MachineFunction &MF, CallTarget->addLiveIn(Reg); CallTarget->setHasAddressTaken(); - CallTarget->setAlignment(4); + CallTarget->setAlignment(Align(16)); insertRegReturnAddrClobber(*CallTarget, Reg); CallTarget->back().setPreInstrSymbol(MF, TargetSym); BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc)); diff --git a/lib/Target/X86/X86SchedBroadwell.td b/lib/Target/X86/X86SchedBroadwell.td index 7574e4b8f896..9b1fcaa8a13d 100755 --- a/lib/Target/X86/X86SchedBroadwell.td +++ b/lib/Target/X86/X86SchedBroadwell.td @@ -232,8 
+232,12 @@ defm : X86WriteRes<WriteFStoreY, [BWPort237,BWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNT, [BWPort237,BWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTX, [BWPort237,BWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTY, [BWPort237,BWPort4], 1, [1,1], 2>; -defm : X86WriteRes<WriteFMaskedStore, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; -defm : X86WriteRes<WriteFMaskedStoreY, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; + +defm : X86WriteRes<WriteFMaskedStore32, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteFMaskedStore32Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteFMaskedStore64, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteFMaskedStore64Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; + defm : X86WriteRes<WriteFMove, [BWPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [BWPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [BWPort5], 1, [1], 1>; diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td index 284d1567c5c6..06f417501b21 100644 --- a/lib/Target/X86/X86SchedHaswell.td +++ b/lib/Target/X86/X86SchedHaswell.td @@ -231,8 +231,12 @@ defm : X86WriteRes<WriteFStoreY, [HWPort237,HWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNT, [HWPort237,HWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTX, [HWPort237,HWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTY, [HWPort237,HWPort4], 1, [1,1], 2>; -defm : X86WriteRes<WriteFMaskedStore, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; -defm : X86WriteRes<WriteFMaskedStoreY, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; + +defm : X86WriteRes<WriteFMaskedStore32, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteFMaskedStore32Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteFMaskedStore64, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteFMaskedStore64Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; + defm : X86WriteRes<WriteFMove, [HWPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [HWPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [HWPort5], 1, [1], 1>; diff --git a/lib/Target/X86/X86SchedPredicates.td b/lib/Target/X86/X86SchedPredicates.td index 41bd776648f7..76001d382a27 100644 --- a/lib/Target/X86/X86SchedPredicates.td +++ b/lib/Target/X86/X86SchedPredicates.td @@ -84,3 +84,60 @@ def IsSETAm_Or_SETBEm : CheckAny<[ CheckImmOperand_s<5, "X86::COND_A">, CheckImmOperand_s<5, "X86::COND_BE"> ]>; + +// A predicate used to check if an instruction has a LOCK prefix. 
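// Note on CheckFunctionPredicate: it does not synthesize a new check; it
// names two pre-existing hooks, X86_MC::hasLockPrefix for the MC layer
// (MCInst, e.g. llvm-mca) and X86InstrInfo::hasLockPrefix for CodeGen
// (MachineInstr), so a single scheduling predicate resolves in both
// contexts. Worked example with the predicates defined below: for a
// cmpxchg8b carrying a LOCK prefix, IsCompareAndSwap8B matches the opcode
// and CheckLockPrefix confirms the prefix, so IsAtomicCompareAndSwap8B
// (a CheckAll, i.e. a logical AND over its operands) is satisfied.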
+def CheckLockPrefix : CheckFunctionPredicate< + "X86_MC::hasLockPrefix", + "X86InstrInfo::hasLockPrefix" +>; + +def IsRegRegCompareAndSwap_8 : CheckOpcode<[ CMPXCHG8rr ]>; + +def IsRegMemCompareAndSwap_8 : CheckOpcode<[ + LCMPXCHG8, CMPXCHG8rm +]>; + +def IsRegRegCompareAndSwap_16_32_64 : CheckOpcode<[ + CMPXCHG16rr, CMPXCHG32rr, CMPXCHG64rr +]>; + +def IsRegMemCompareAndSwap_16_32_64 : CheckOpcode<[ + CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm, + LCMPXCHG16, LCMPXCHG32, LCMPXCHG64, + LCMPXCHG8B, LCMPXCHG16B +]>; + +def IsCompareAndSwap8B : CheckOpcode<[ CMPXCHG8B, LCMPXCHG8B ]>; +def IsCompareAndSwap16B : CheckOpcode<[ CMPXCHG16B, LCMPXCHG16B ]>; + +def IsRegMemCompareAndSwap : CheckOpcode< + !listconcat( + IsRegMemCompareAndSwap_8.ValidOpcodes, + IsRegMemCompareAndSwap_16_32_64.ValidOpcodes + )>; + +def IsRegRegCompareAndSwap : CheckOpcode< + !listconcat( + IsRegRegCompareAndSwap_8.ValidOpcodes, + IsRegRegCompareAndSwap_16_32_64.ValidOpcodes + )>; + +def IsAtomicCompareAndSwap_8 : CheckAll<[ + CheckLockPrefix, + IsRegMemCompareAndSwap_8 +]>; + +def IsAtomicCompareAndSwap : CheckAll<[ + CheckLockPrefix, + IsRegMemCompareAndSwap +]>; + +def IsAtomicCompareAndSwap8B : CheckAll<[ + CheckLockPrefix, + IsCompareAndSwap8B +]>; + +def IsAtomicCompareAndSwap16B : CheckAll<[ + CheckLockPrefix, + IsCompareAndSwap16B +]>; diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td index d40bdf728a48..26d4d8fa3549 100644 --- a/lib/Target/X86/X86SchedSandyBridge.td +++ b/lib/Target/X86/X86SchedSandyBridge.td @@ -208,8 +208,12 @@ defm : X86WriteRes<WriteFStoreY, [SBPort23,SBPort4], 1, [1,1], 1>; defm : X86WriteRes<WriteFStoreNT, [SBPort23,SBPort4], 1, [1,1], 1>; defm : X86WriteRes<WriteFStoreNTX, [SBPort23,SBPort4], 1, [1,1], 1>; defm : X86WriteRes<WriteFStoreNTY, [SBPort23,SBPort4], 1, [1,1], 1>; -defm : X86WriteRes<WriteFMaskedStore, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; -defm : X86WriteRes<WriteFMaskedStoreY, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; + +defm : X86WriteRes<WriteFMaskedStore32, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; +defm : X86WriteRes<WriteFMaskedStore32Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; +defm : X86WriteRes<WriteFMaskedStore64, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; +defm : X86WriteRes<WriteFMaskedStore64Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; + defm : X86WriteRes<WriteFMove, [SBPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [SBPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [SBPort5], 1, [1], 1>; diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td index 8f3e4ae62d53..9a511ecc0071 100644 --- a/lib/Target/X86/X86SchedSkylakeClient.td +++ b/lib/Target/X86/X86SchedSkylakeClient.td @@ -226,8 +226,12 @@ defm : X86WriteRes<WriteFStoreY, [SKLPort237,SKLPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNT, [SKLPort237,SKLPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTX, [SKLPort237,SKLPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTY, [SKLPort237,SKLPort4], 1, [1,1], 2>; -defm : X86WriteRes<WriteFMaskedStore, [SKLPort237,SKLPort0], 2, [1,1], 2>; -defm : X86WriteRes<WriteFMaskedStoreY, [SKLPort237,SKLPort0], 2, [1,1], 2>; + +defm : X86WriteRes<WriteFMaskedStore32, [SKLPort237,SKLPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedStore32Y, [SKLPort237,SKLPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedStore64, [SKLPort237,SKLPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedStore64Y, [SKLPort237,SKLPort0], 2, [1,1], 2>; + defm : 
X86WriteRes<WriteFMove, [SKLPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [SKLPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [SKLPort015], 1, [1], 1>; diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td index 58caf1dacfcb..a8c65435ab9b 100755 --- a/lib/Target/X86/X86SchedSkylakeServer.td +++ b/lib/Target/X86/X86SchedSkylakeServer.td @@ -226,8 +226,12 @@ defm : X86WriteRes<WriteFStoreY, [SKXPort237,SKXPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNT, [SKXPort237,SKXPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTX, [SKXPort237,SKXPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTY, [SKXPort237,SKXPort4], 1, [1,1], 2>; -defm : X86WriteRes<WriteFMaskedStore, [SKXPort237,SKXPort0], 2, [1,1], 2>; -defm : X86WriteRes<WriteFMaskedStoreY, [SKXPort237,SKXPort0], 2, [1,1], 2>; + +defm : X86WriteRes<WriteFMaskedStore32, [SKXPort237,SKXPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedStore32Y, [SKXPort237,SKXPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedStore64, [SKXPort237,SKXPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedStore64Y, [SKXPort237,SKXPort0], 2, [1,1], 2>; + defm : X86WriteRes<WriteFMove, [SKXPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [SKXPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [SKXPort015], 1, [1], 1>; diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index 55ca85ec1e3d..95f710061aeb 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -102,6 +102,12 @@ class X86SchedWriteMoveLS<SchedWrite MoveRR, SchedWrite MR = StoreMR; } +// Multiclass that wraps masked load/store writes for a vector width. +class X86SchedWriteMaskMove<SchedWrite LoadRM, SchedWrite StoreMR> { + SchedWrite RM = LoadRM; + SchedWrite MR = StoreMR; +} + // Multiclass that wraps X86SchedWriteMoveLS for each vector width. class X86SchedWriteMoveLSWidths<X86SchedWriteMoveLS sScl, X86SchedWriteMoveLS s128, @@ -218,8 +224,12 @@ def WriteFStoreY : SchedWrite; def WriteFStoreNT : SchedWrite; def WriteFStoreNTX : SchedWrite; def WriteFStoreNTY : SchedWrite; -def WriteFMaskedStore : SchedWrite; -def WriteFMaskedStoreY : SchedWrite; + +def WriteFMaskedStore32 : SchedWrite; +def WriteFMaskedStore64 : SchedWrite; +def WriteFMaskedStore32Y : SchedWrite; +def WriteFMaskedStore64Y : SchedWrite; + def WriteFMove : SchedWrite; def WriteFMoveX : SchedWrite; def WriteFMoveY : SchedWrite; @@ -530,6 +540,16 @@ def SchedWriteVecMoveLSNT : X86SchedWriteMoveLSWidths<WriteVecMoveLSNT, WriteVecMoveLSNTX, WriteVecMoveLSNTY, WriteVecMoveLSNTY>; +// Conditional SIMD Packed Loads and Stores wrappers. +def WriteFMaskMove32 + : X86SchedWriteMaskMove<WriteFMaskedLoad, WriteFMaskedStore32>; +def WriteFMaskMove64 + : X86SchedWriteMaskMove<WriteFMaskedLoad, WriteFMaskedStore64>; +def WriteFMaskMove32Y + : X86SchedWriteMaskMove<WriteFMaskedLoadY, WriteFMaskedStore32Y>; +def WriteFMaskMove64Y + : X86SchedWriteMaskMove<WriteFMaskedLoadY, WriteFMaskedStore64Y>; + // Vector width wrappers. 
def SchedWriteFAdd : X86SchedWriteWidths<WriteFAdd, WriteFAddX, WriteFAddY, WriteFAddZ>; diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index b0334655de7e..78acb1065ec8 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -216,8 +216,10 @@ defm : X86WriteResUnsupported<WriteFStoreY>; def : WriteRes<WriteFStoreNT, [AtomPort0]>; def : WriteRes<WriteFStoreNTX, [AtomPort0]>; defm : X86WriteResUnsupported<WriteFStoreNTY>; -defm : X86WriteResUnsupported<WriteFMaskedStore>; -defm : X86WriteResUnsupported<WriteFMaskedStoreY>; +defm : X86WriteResUnsupported<WriteFMaskedStore32>; +defm : X86WriteResUnsupported<WriteFMaskedStore32Y>; +defm : X86WriteResUnsupported<WriteFMaskedStore64>; +defm : X86WriteResUnsupported<WriteFMaskedStore64Y>; def : WriteRes<WriteFMove, [AtomPort01]>; def : WriteRes<WriteFMoveX, [AtomPort01]>; diff --git a/lib/Target/X86/X86ScheduleBdVer2.td b/lib/Target/X86/X86ScheduleBdVer2.td index 8cc01c3acece..d7aea3cf4e9d 100644 --- a/lib/Target/X86/X86ScheduleBdVer2.td +++ b/lib/Target/X86/X86ScheduleBdVer2.td @@ -726,8 +726,10 @@ defm : PdWriteRes<WriteFStoreNT, [PdStore, PdFPU1, PdFPSTO], 3>; defm : PdWriteRes<WriteFStoreNTX, [PdStore, PdFPU1, PdFPSTO], 3>; defm : PdWriteRes<WriteFStoreNTY, [PdStore, PdFPU1, PdFPSTO], 3, [2, 2, 2], 4>; -defm : PdWriteRes<WriteFMaskedStore, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>; -defm : PdWriteRes<WriteFMaskedStoreY, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>; +defm : PdWriteRes<WriteFMaskedStore32, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>; +defm : PdWriteRes<WriteFMaskedStore64, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>; +defm : PdWriteRes<WriteFMaskedStore32Y, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>; +defm : PdWriteRes<WriteFMaskedStore64Y, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>; defm : PdWriteRes<WriteFMove, [PdFPU01, PdFPFMA]>; defm : PdWriteRes<WriteFMoveX, [PdFPU01, PdFPFMA], 1, [1, 2]>; diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td index 2d26232b4132..d0421d94ee05 100644 --- a/lib/Target/X86/X86ScheduleBtVer2.td +++ b/lib/Target/X86/X86ScheduleBtVer2.td @@ -180,9 +180,11 @@ multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW, // Instructions that have local forwarding disabled have an extra +1cy latency. -// A folded store needs a cycle on the SAGU for the store data, -// most RMW instructions don't need an extra uop. -defm : X86WriteRes<WriteRMW, [JSAGU], 1, [1], 0>; +// A folded store needs a cycle on the SAGU for the store data; most RMW +// instructions don't need an extra uop. ALU RMW operations don't seem to +// benefit from STLF, and their observed latency is 6cy. That is the reason why +// this write adds two extra cycles (instead of just 1cy for the store). +defm : X86WriteRes<WriteRMW, [JSAGU], 2, [1], 0>; //////////////////////////////////////////////////////////////////////////////// // Arithmetic.
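// How to read the defm entries in this section (parameter order per
// X86WriteRes/JWriteResIntPair: write, pipes, latency, pipe-cycles, uops).
// Example taken from the table below:
//   defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>;
// i.e. 6-cycle latency, JALU1 busy for 1 cycle, the JMul unit busy for 4,
// issued as 2 micro-ops; the folded-load variant generated by the
// multiclass adds the load-to-use latency (3cy on Jaguar, per the comment
// further below) on top. With WriteALU at 1cy, the 2cy WriteRMW above
// reproduces the observed 6cy for ALU read-modify-write operations
// (3cy load + 1cy ALU + 2cy store path).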
@@ -191,22 +193,22 @@ defm : X86WriteRes<WriteRMW, [JSAGU], 1, [1], 0>; defm : JWriteResIntPair<WriteALU, [JALU01], 1>; defm : JWriteResIntPair<WriteADC, [JALU01], 1, [2]>; -defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>; -defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>; -defm : X86WriteRes<WriteCMPXCHG,[JALU01], 1, [1], 1>; -defm : X86WriteRes<WriteCMPXCHGRMW,[JALU01, JSAGU, JLAGU], 4, [1, 1, 1], 2>; -defm : X86WriteRes<WriteXCHG, [JALU01], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>; +defm : X86WriteRes<WriteCMPXCHG, [JALU01], 3, [3], 5>; +defm : X86WriteRes<WriteCMPXCHGRMW, [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>; +defm : X86WriteRes<WriteXCHG, [JALU01], 1, [2], 2>; -defm : JWriteResIntPair<WriteIMul8, [JALU1, JMul], 3, [1, 1], 2>; -defm : JWriteResIntPair<WriteIMul16, [JALU1, JMul], 3, [1, 1], 2>; -defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 3, [1, 1], 2>; -defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 2>; -defm : JWriteResIntPair<WriteIMul32, [JALU1, JMul], 3, [1, 1], 2>; -defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 2>; -defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 2>; -defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>; -defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 2>; -defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 2>; +defm : JWriteResIntPair<WriteIMul8, [JALU1, JMul], 3, [1, 1], 1>; +defm : JWriteResIntPair<WriteIMul16, [JALU1, JMul], 3, [1, 3], 3>; +defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 4, [1, 2], 2>; +defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 1>; +defm : JWriteResIntPair<WriteIMul32, [JALU1, JMul], 3, [1, 2], 2>; +defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 1>; +defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>; +defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>; +defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>; +defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>; defm : X86WriteRes<WriteIMulH, [JALU1], 6, [4], 1>; defm : JWriteResIntPair<WriteDiv8, [JALU1, JDiv], 12, [1, 12], 1>; @@ -305,6 +307,192 @@ def : WriteRes<WriteFence, [JSAGU]>; // to '1' to tell the scheduler that the nop uses an ALU slot for a cycle. 
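// The CMPXCHG section below relies on SchedWriteVariant: the scheduler
// tests each SchedVar's MCSchedPredicate in order and selects the first
// match, with NoSchedPred as the catch-all. For example, LOCK CMPXCHG16B
// resolves through IsAtomicCompareAndSwap16B to JWriteLOCK_CMPXCHG16B
// (38cy, 28 uops), while a plain register-register CMPXCHG32rr matches no
// earlier predicate and falls through to the default WriteCMPXCHG.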
def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; } +def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> { + let Latency = 3; + let ResourceCycles = [3]; + let NumMicroOps = 3; +} + +def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 16; + let ResourceCycles = [3,16,16]; + let NumMicroOps = 5; +} + +def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 17; + let ResourceCycles = [3,17,17]; + let NumMicroOps = 6; +} + +def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 11; + let ResourceCycles = [3,1,1]; + let NumMicroOps = 5; +} + +def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 11; + let ResourceCycles = [3,1,1]; + let NumMicroOps = 18; +} + +def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 32; + let ResourceCycles = [6,1,1]; + let NumMicroOps = 28; +} + +def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 19; + let ResourceCycles = [3,19,19]; + let NumMicroOps = 18; +} + +def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 38; + let ResourceCycles = [6,38,38]; + let NumMicroOps = 28; +} + +def JWriteCMPXCHGVariant : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>, [JWriteLOCK_CMPXCHG8B]>, + SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>, [JWriteLOCK_CMPXCHG16B]>, + SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>, [JWriteLOCK_CMPXCHG8rm]>, + SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>, [JWriteLOCK_CMPXCHGrm]>, + SchedVar<MCSchedPredicate<IsCompareAndSwap8B>, [JWriteCMPXCHG8B]>, + SchedVar<MCSchedPredicate<IsCompareAndSwap16B>, [JWriteCMPXCHG16B]>, + SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>, [JWriteCMPXCHG8rm]>, + SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>, [WriteCMPXCHGRMW]>, + SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>, [JWriteCMPXCHG8rr]>, + SchedVar<NoSchedPred, [WriteCMPXCHG]> +]>; + +// The first five reads are contributed by the memory load operand. +// We ignore those reads and set a read-advance for the other input operands +// including the implicit read of RAX. +def : InstRW<[JWriteCMPXCHGVariant, + ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8, LCMPXCHG16, + LCMPXCHG32, LCMPXCHG64, + CMPXCHG8rm, CMPXCHG16rm, + CMPXCHG32rm, CMPXCHG64rm)>; + +def : InstRW<[JWriteCMPXCHGVariant], (instrs CMPXCHG8rr, CMPXCHG16rr, + CMPXCHG32rr, CMPXCHG64rr)>; + +def : InstRW<[JWriteCMPXCHGVariant, + // Ignore reads contributed by the memory operand. + ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + // Add a read-advance to every implicit register read. 
+ ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8B, LCMPXCHG16B, + CMPXCHG8B, CMPXCHG16B)>; + +def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 19; + let ResourceCycles = [1,19,19]; + let NumMicroOps = 1; +} + +def JWriteLOCK_ALURMWVariant : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_ALURMW]>, + SchedVar<NoSchedPred, [WriteALURMW]> +]>; +def : InstRW<[JWriteLOCK_ALURMWVariant], (instrs INC8m, INC16m, INC32m, INC64m, + DEC8m, DEC16m, DEC32m, DEC64m, + NOT8m, NOT16m, NOT32m, NOT64m, + NEG8m, NEG16m, NEG32m, NEG64m)>; + +def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> { + let Latency = 2; + let ResourceCycles = [3]; + let NumMicroOps = 3; +} +def : InstRW<[JWriteXCHG8rr_XADDrr], (instrs XCHG8rr, XADD8rr, XADD16rr, + XADD32rr, XADD64rr)>; + +// This write defines the latency of the in/out register operand of a non-atomic +// XADDrm. This is the first of a pair of writes that model non-atomic +// XADDrm instructions (the second write definition is JWriteXADDrm_LdSt_Part). +// +// We need two writes because the instruction latency differs from the output +// register operand latency. In particular, the first write describes the first +// (and only) output register operand of the instruction. However, the +// instruction latency is set to the MAX of all the write latencies. That's why +// a second write is needed in this case (see example below). +// +// Example: +// XADD %ecx, (%rsp) ## Instruction latency: 11cy +// ## ECX write Latency: 3cy +// +// Register ECX becomes available in 3 cycles. That is because the value of ECX +// is exchanged with the value read from the stack pointer, and the load-to-use +// latency is assumed to be 3cy. +def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> { + let Latency = 3; // load-to-use latency + let ResourceCycles = [3]; + let NumMicroOps = 3; +} + +// This write defines the latency of the in/out register operand of an atomic +// XADDrm. This is the first of a sequence of two writes used to model atomic +// XADD instructions. The second write of the sequence is JWriteXCHGrm_LdSt_Part. +// +// +// Example: +// LOCK XADD %ecx, (%rsp) ## Instruction Latency: 16cy +// ## ECX write Latency: 11cy +// +// The value of ECX becomes available only after 11cy from the start of +// execution. This write is used to specifically set that operand latency. +def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> { + let Latency = 11; + let ResourceCycles = [3]; + let NumMicroOps = 3; +} + +// This write defines the latency of the in/out register operand of an atomic +// XCHGrm. This write is the first of a sequence of two writes that describe +// atomic XCHG operations. We need two writes because the instruction latency +// differs from the output register write latency. We want to make sure that +// the output register operand becomes visible after 11cy. However, we want to +// set the instruction latency to 16cy. 
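// Summary of the two-write scheme (numbers from the definitions in this
// section):
//
//   form             register result     instruction latency
//   XADD  r,m        3cy (load-to-use)   11cy
//   LOCK XADD r,m    11cy                16cy
//   XCHG  r,m        11cy                16cy
//
// The first write of each pair sets the latency of the in/out register
// operand; the second (LdSt) write models the memory pipeline, and the
// scheduler takes the MAX of all write latencies as the instruction
// latency.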
+def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> { + let Latency = 11; + let ResourceCycles = [2]; + let NumMicroOps = 2; +} + +def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> { + let Latency = 11; + let ResourceCycles = [1, 1]; + let NumMicroOps = 1; +} + +def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> { + let Latency = 16; + let ResourceCycles = [16, 16]; + let NumMicroOps = 1; +} + +def JWriteXADDrm_Part1 : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_XADDrm_XCHG_Part]>, + SchedVar<NoSchedPred, [JWriteXADDrm_XCHG_Part]> +]>; + +def JWriteXADDrm_Part2 : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteXCHGrm_LdSt_Part]>, + SchedVar<NoSchedPred, [JWriteXADDrm_LdSt_Part]> +]>; + +def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd], + (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm, + LXADD8, LXADD16, LXADD32, LXADD64)>; + +def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd], + (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>; + + //////////////////////////////////////////////////////////////////////////////// // Floating point. This covers both scalar and vector operations. //////////////////////////////////////////////////////////////////////////////// @@ -313,19 +501,22 @@ defm : X86WriteRes<WriteFLD0, [JFPU1, JSTC], 3, [1,1], 1>; defm : X86WriteRes<WriteFLD1, [JFPU1, JSTC], 3, [1,1], 1>; defm : X86WriteRes<WriteFLDC, [JFPU1, JSTC], 3, [1,1], 1>; defm : X86WriteRes<WriteFLoad, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>; -defm : X86WriteRes<WriteFLoadX, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>; -defm : X86WriteRes<WriteFLoadY, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>; +defm : X86WriteRes<WriteFLoadX, [JLAGU], 5, [1], 1>; +defm : X86WriteRes<WriteFLoadY, [JLAGU], 5, [2], 2>; defm : X86WriteRes<WriteFMaskedLoad, [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>; defm : X86WriteRes<WriteFMaskedLoadY, [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>; defm : X86WriteRes<WriteFStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>; defm : X86WriteRes<WriteFStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>; -defm : X86WriteRes<WriteFStoreY, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>; +defm : X86WriteRes<WriteFStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>; defm : X86WriteRes<WriteFStoreNT, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>; defm : X86WriteRes<WriteFStoreNTX, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>; defm : X86WriteRes<WriteFStoreNTY, [JSAGU, JFPU1, JSTC], 3, [2, 2, 2], 1>; -defm : X86WriteRes<WriteFMaskedStore, [JSAGU, JFPU01, JFPX], 6, [1, 1, 4], 1>; -defm : X86WriteRes<WriteFMaskedStoreY, [JSAGU, JFPU01, JFPX], 6, [2, 2, 4], 2>; + +defm : X86WriteRes<WriteFMaskedStore32, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 5, 5,4,4,4], 19>; +defm : X86WriteRes<WriteFMaskedStore64, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 13, [1,1, 2, 2,2,2,2], 10>; +defm : X86WriteRes<WriteFMaskedStore32Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 22, [1,1,10,10,8,8,8], 36>; +defm : X86WriteRes<WriteFMaskedStore64Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 4, 4,4,4,4], 18>; defm : X86WriteRes<WriteFMove, [JFPU01, JFPX], 1, [1, 1], 1>; defm : X86WriteRes<WriteFMoveX, [JFPU01, JFPX], 1, [1, 1], 1>; @@ -466,8 +657,8 @@ defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; //////////////////////////////////////////////////////////////////////////////// defm : X86WriteRes<WriteVecLoad, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>; -defm : X86WriteRes<WriteVecLoadX, [JLAGU, JFPU01, JVALU], 5, [1, 1, 
1], 1>; -defm : X86WriteRes<WriteVecLoadY, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>; +defm : X86WriteRes<WriteVecLoadX, [JLAGU], 5, [1], 1>; +defm : X86WriteRes<WriteVecLoadY, [JLAGU], 5, [2], 2>; defm : X86WriteRes<WriteVecLoadNT, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>; defm : X86WriteRes<WriteVecLoadNTY, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>; defm : X86WriteRes<WriteVecMaskedLoad, [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>; @@ -475,7 +666,7 @@ defm : X86WriteRes<WriteVecMaskedLoadY, [JLAGU, JFPU01, JVALU], 6, [2, 4, 4], defm : X86WriteRes<WriteVecStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>; defm : X86WriteRes<WriteVecStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>; -defm : X86WriteRes<WriteVecStoreY, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>; +defm : X86WriteRes<WriteVecStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>; defm : X86WriteRes<WriteVecStoreNT, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>; defm : X86WriteRes<WriteVecStoreNTY, [JSAGU, JFPU1, JSTC], 2, [2, 2, 2], 1>; defm : X86WriteRes<WriteVecMaskedStore, [JSAGU, JFPU01, JVALU], 6, [1, 1, 4], 1>; @@ -631,6 +822,18 @@ def JWriteJVZEROUPPER: SchedWriteRes<[]> { def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>; /////////////////////////////////////////////////////////////////////////////// +// SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQ +/////////////////////////////////////////////////////////////////////////////// + +def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> { + let Latency = 34; + let ResourceCycles = [1, 1, 2, 2, 2, 16, 42]; + let NumMicroOps = 63; +} +def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64, + VMASKMOVDQU, VMASKMOVDQU64)>; + +/////////////////////////////////////////////////////////////////////////////// // SchedWriteVariant definitions. 
/////////////////////////////////////////////////////////////////////////////// diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td index 34c251a5c5bb..8e3ce721f1a1 100644 --- a/lib/Target/X86/X86ScheduleSLM.td +++ b/lib/Target/X86/X86ScheduleSLM.td @@ -186,8 +186,12 @@ def : WriteRes<WriteFStoreY, [SLM_MEC_RSV]>; def : WriteRes<WriteFStoreNT, [SLM_MEC_RSV]>; def : WriteRes<WriteFStoreNTX, [SLM_MEC_RSV]>; def : WriteRes<WriteFStoreNTY, [SLM_MEC_RSV]>; -def : WriteRes<WriteFMaskedStore, [SLM_MEC_RSV]>; -def : WriteRes<WriteFMaskedStoreY, [SLM_MEC_RSV]>; + +def : WriteRes<WriteFMaskedStore32, [SLM_MEC_RSV]>; +def : WriteRes<WriteFMaskedStore32Y, [SLM_MEC_RSV]>; +def : WriteRes<WriteFMaskedStore64, [SLM_MEC_RSV]>; +def : WriteRes<WriteFMaskedStore64Y, [SLM_MEC_RSV]>; + def : WriteRes<WriteFMove, [SLM_FPC_RSV01]>; def : WriteRes<WriteFMoveX, [SLM_FPC_RSV01]>; def : WriteRes<WriteFMoveY, [SLM_FPC_RSV01]>; diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td index 65f6d89df610..06201f4a3a84 100644 --- a/lib/Target/X86/X86ScheduleZnver1.td +++ b/lib/Target/X86/X86ScheduleZnver1.td @@ -268,8 +268,12 @@ defm : X86WriteRes<WriteFStoreY, [ZnAGU], 1, [1], 1>; defm : X86WriteRes<WriteFStoreNT, [ZnAGU,ZnFPU2], 8, [1,1], 1>; defm : X86WriteRes<WriteFStoreNTX, [ZnAGU], 1, [1], 1>; defm : X86WriteRes<WriteFStoreNTY, [ZnAGU], 1, [1], 1>; -defm : X86WriteRes<WriteFMaskedStore, [ZnAGU,ZnFPU01], 4, [1,1], 1>; -defm : X86WriteRes<WriteFMaskedStoreY, [ZnAGU,ZnFPU01], 5, [1,2], 2>; + +defm : X86WriteRes<WriteFMaskedStore32, [ZnAGU,ZnFPU01], 4, [1,1], 1>; +defm : X86WriteRes<WriteFMaskedStore32Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>; +defm : X86WriteRes<WriteFMaskedStore64, [ZnAGU,ZnFPU01], 4, [1,1], 1>; +defm : X86WriteRes<WriteFMaskedStore64Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>; + defm : X86WriteRes<WriteFMove, [ZnFPU], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [ZnFPU], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [ZnFPU], 1, [1], 1>; diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 50690953eef5..1ae8df977f83 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -36,7 +36,7 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible( const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>( DAG.getSubtarget().getRegisterInfo()); - unsigned BaseReg = TRI->getBaseRegister(); + Register BaseReg = TRI->getBaseRegister(); for (unsigned R : ClobberSet) if (BaseReg == R) return true; diff --git a/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/lib/Target/X86/X86SpeculativeLoadHardening.cpp index 40f5dbe57e4b..b8980789258e 100644 --- a/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -477,7 +477,7 @@ bool X86SpeculativeLoadHardeningPass::runOnMachineFunction( // Otherwise, just build the predicate state itself by zeroing a register // as we don't need any initial state. 
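// Recap of the invariant this pass maintains: the predicate state register
// is all-zeros on a correctly predicted path and all-ones (PS->PoisonReg)
// once misspeculation is detected, so OR-ing the state into an address or
// a loaded value (see hardenValueInRegister below) poisons every value
// that is live only under misspeculation.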
PS->InitialReg = MRI->createVirtualRegister(PS->RC); - unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass); + Register PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass); auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0), PredStateSubReg); ++NumInstsInserted; @@ -750,7 +750,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG( int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8; auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes); - unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC); + Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC); // Note that we intentionally use an empty debug location so that // this picks up the preceding location. auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(), @@ -907,7 +907,7 @@ void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads( MI.dump(); dbgs() << "\n"); report_fatal_error("Unable to unfold load!"); } - unsigned Reg = MRI->createVirtualRegister(UnfoldedRC); + Register Reg = MRI->createVirtualRegister(UnfoldedRC); SmallVector<MachineInstr *, 2> NewMIs; // If we were able to compute an unfolded reg class, any failure here // is just a programming error so just assert. @@ -1102,7 +1102,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches( // synthetic target in the predecessor. We do this at the bottom of the // predecessor. auto InsertPt = Pred->getFirstTerminator(); - unsigned TargetReg = MRI->createVirtualRegister(&X86::GR64RegClass); + Register TargetReg = MRI->createVirtualRegister(&X86::GR64RegClass); if (MF.getTarget().getCodeModel() == CodeModel::Small && !Subtarget->isPositionIndependent()) { // Directly materialize it into an immediate. @@ -1153,7 +1153,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches( LLVM_DEBUG(dbgs() << " Inserting cmp: "; CheckI->dump(); dbgs() << "\n"); } else { // Otherwise compute the address into a register first. - unsigned AddrReg = MRI->createVirtualRegister(&X86::GR64RegClass); + Register AddrReg = MRI->createVirtualRegister(&X86::GR64RegClass); auto AddrI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::LEA64r), AddrReg) .addReg(/*Base*/ X86::RIP) @@ -1175,7 +1175,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches( // Now cmov over the predicate if the comparison wasn't equal. int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8; auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes); - unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC); + Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC); auto CMovI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(CMovOp), UpdatedStateReg) .addReg(PS->InitialReg) @@ -1878,7 +1878,7 @@ unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS( DebugLoc Loc) { // FIXME: Hard coding this to a 32-bit register class seems weird, but matches // what instruction selection does. - unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass); + Register Reg = MRI->createVirtualRegister(&X86::GR32RegClass); // We directly copy the FLAGS register and rely on later lowering to clean // this up into the appropriate setCC instructions. 
BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS); @@ -1905,7 +1905,7 @@ void X86SpeculativeLoadHardeningPass::restoreEFLAGS( void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc, unsigned PredStateReg) { - unsigned TmpReg = MRI->createVirtualRegister(PS->RC); + Register TmpReg = MRI->createVirtualRegister(PS->RC); // FIXME: This hard codes a shift distance based on the number of bits needed // to stay canonical on 64-bit. We should compute this somehow and support // 32-bit as part of that. @@ -1925,8 +1925,8 @@ void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP( unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc) { - unsigned PredStateReg = MRI->createVirtualRegister(PS->RC); - unsigned TmpReg = MRI->createVirtualRegister(PS->RC); + Register PredStateReg = MRI->createVirtualRegister(PS->RC); + Register TmpReg = MRI->createVirtualRegister(PS->RC); // We know that the stack pointer will have any preserved predicate state in // its high bit. We just want to smear this across the other bits. Turns out, @@ -2031,9 +2031,9 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr( } for (MachineOperand *Op : HardenOpRegs) { - unsigned OpReg = Op->getReg(); + Register OpReg = Op->getReg(); auto *OpRC = MRI->getRegClass(OpReg); - unsigned TmpReg = MRI->createVirtualRegister(OpRC); + Register TmpReg = MRI->createVirtualRegister(OpRC); // If this is a vector register, we'll need somewhat custom logic to handle // hardening it. @@ -2045,7 +2045,7 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr( // Move our state into a vector register. // FIXME: We could skip this at the cost of longer encodings with AVX-512 // but that doesn't seem likely worth it. - unsigned VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass); + Register VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass); auto MovI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::VMOV64toPQIrr), VStateReg) .addReg(StateReg); @@ -2054,7 +2054,7 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr( LLVM_DEBUG(dbgs() << " Inserting mov: "; MovI->dump(); dbgs() << "\n"); // Broadcast it across the vector register. - unsigned VBStateReg = MRI->createVirtualRegister(OpRC); + Register VBStateReg = MRI->createVirtualRegister(OpRC); auto BroadcastI = BuildMI(MBB, InsertPt, Loc, TII->get(Is128Bit ? X86::VPBROADCASTQrr : X86::VPBROADCASTQYrr), @@ -2084,7 +2084,7 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr( assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!"); // Broadcast our state into a vector register. - unsigned VStateReg = MRI->createVirtualRegister(OpRC); + Register VStateReg = MRI->createVirtualRegister(OpRC); unsigned BroadcastOp = Is128Bit ? X86::VPBROADCASTQrZ128r : Is256Bit ? X86::VPBROADCASTQrZ256r : X86::VPBROADCASTQrZr; @@ -2153,7 +2153,7 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst( // See if we can sink hardening the loaded value. auto SinkCheckToSingleUse = [&](MachineInstr &MI) -> Optional<MachineInstr *> { - unsigned DefReg = MI.getOperand(0).getReg(); + Register DefReg = MI.getOperand(0).getReg(); // We need to find a single use which we can sink the check. 
We can // primarily do this because many uses may already end up checked on their @@ -2210,8 +2210,8 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst( // If this register isn't a virtual register we can't walk uses of sanely, // just bail. Also check that its register class is one of the ones we // can harden. - unsigned UseDefReg = UseMI.getOperand(0).getReg(); - if (!TRI->isVirtualRegister(UseDefReg) || + Register UseDefReg = UseMI.getOperand(0).getReg(); + if (!Register::isVirtualRegister(UseDefReg) || !canHardenRegister(UseDefReg)) return {}; @@ -2241,6 +2241,9 @@ bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) { // We don't support post-load hardening of vectors. return false; + unsigned RegIdx = Log2_32(RegBytes); + assert(RegIdx < 4 && "Unsupported register size"); + // If this register class is explicitly constrained to a class that doesn't // require REX prefix, we may not be able to satisfy that constraint when // emitting the hardening instructions, so bail out here. @@ -2251,13 +2254,13 @@ bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) { const TargetRegisterClass *NOREXRegClasses[] = { &X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass, &X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass}; - if (RC == NOREXRegClasses[Log2_32(RegBytes)]) + if (RC == NOREXRegClasses[RegIdx]) return false; const TargetRegisterClass *GPRRegClasses[] = { &X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass, &X86::GR64RegClass}; - return RC->hasSuperClassEq(GPRRegClasses[Log2_32(RegBytes)]); + return RC->hasSuperClassEq(GPRRegClasses[RegIdx]); } /// Harden a value in a register. @@ -2278,7 +2281,7 @@ unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister( unsigned Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc) { assert(canHardenRegister(Reg) && "Cannot harden this register!"); - assert(TRI->isVirtualRegister(Reg) && "Cannot harden a physical register!"); + assert(Register::isVirtualRegister(Reg) && "Cannot harden a physical register!"); auto *RC = MRI->getRegClass(Reg); int Bytes = TRI->getRegSizeInBits(*RC) / 8; @@ -2289,7 +2292,7 @@ unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister( if (Bytes != 8) { unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit}; unsigned SubRegImm = SubRegImms[Log2_32(Bytes)]; - unsigned NarrowStateReg = MRI->createVirtualRegister(RC); + Register NarrowStateReg = MRI->createVirtualRegister(RC); BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), NarrowStateReg) .addReg(StateReg, 0, SubRegImm); StateReg = NarrowStateReg; @@ -2299,7 +2302,7 @@ unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister( if (isEFLAGSLive(MBB, InsertPt, *TRI)) FlagsReg = saveEFLAGS(MBB, InsertPt, Loc); - unsigned NewReg = MRI->createVirtualRegister(RC); + Register NewReg = MRI->createVirtualRegister(RC); unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr}; unsigned OrOpCode = OrOpCodes[Log2_32(Bytes)]; auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), NewReg) @@ -2329,13 +2332,13 @@ unsigned X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) { DebugLoc Loc = MI.getDebugLoc(); auto &DefOp = MI.getOperand(0); - unsigned OldDefReg = DefOp.getReg(); + Register OldDefReg = DefOp.getReg(); auto *DefRC = MRI->getRegClass(OldDefReg); // Because we want to completely replace the uses of this def'ed value with // the hardened value, create a dedicated new register that will only be used // to 
communicate the unhardened value to the hardening. - unsigned UnhardenedReg = MRI->createVirtualRegister(DefRC); + Register UnhardenedReg = MRI->createVirtualRegister(DefRC); DefOp.setReg(UnhardenedReg); // Now harden this register's value, getting a hardened reg that is safe to @@ -2537,7 +2540,7 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall( .addReg(ExpectedRetAddrReg, RegState::Kill) .addSym(RetSymbol); } else { - unsigned ActualRetAddrReg = MRI->createVirtualRegister(AddrRC); + Register ActualRetAddrReg = MRI->createVirtualRegister(AddrRC); BuildMI(MBB, InsertPt, Loc, TII->get(X86::LEA64r), ActualRetAddrReg) .addReg(/*Base*/ X86::RIP) .addImm(/*Scale*/ 1) @@ -2554,7 +2557,7 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall( int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8; auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes); - unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC); + Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC); auto CMovI = BuildMI(MBB, InsertPt, Loc, TII->get(CMovOp), UpdatedStateReg) .addReg(NewStateReg, RegState::Kill) .addReg(PS->PoisonReg) @@ -2611,7 +2614,7 @@ void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr( // For all of these, the target register is the first operand of the // instruction. auto &TargetOp = MI.getOperand(0); - unsigned OldTargetReg = TargetOp.getReg(); + Register OldTargetReg = TargetOp.getReg(); // Try to lookup a hardened version of this register. We retain a reference // here as we want to update the map to track any newly computed hardened diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index d5bb56603df9..f8f78da52cc2 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -146,6 +146,9 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV, return X86II::MO_DLLIMPORT; return X86II::MO_COFFSTUB; } + // Some JIT users use *-win32-elf triples; these shouldn't use GOT tables. + if (isOSWindows()) + return X86II::MO_NO_FLAG; if (is64Bit()) { // ELF supports a large, truly PIC code model with non-PC relative GOT @@ -285,10 +288,10 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and Solaris (both // 32 and 64 bit) and for all 64-bit targets. if (StackAlignOverride) - stackAlignment = StackAlignOverride; + stackAlignment = *StackAlignOverride; else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() || isTargetKFreeBSD() || In64BitMode) - stackAlignment = 16; + stackAlignment = Align(16); // Some CPUs have more overhead for gather. The specified overhead is relative // to the Load operation. "2" is the number provided by Intel architects. This @@ -304,6 +307,8 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { // Consume the vector width attribute or apply any target specific limit. 
if (PreferVectorWidthOverride) PreferVectorWidth = PreferVectorWidthOverride; + else if (Prefer128Bit) + PreferVectorWidth = 128; else if (Prefer256Bit) PreferVectorWidth = 256; } @@ -316,12 +321,11 @@ X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, const X86TargetMachine &TM, - unsigned StackAlignOverride, + MaybeAlign StackAlignOverride, unsigned PreferVectorWidthOverride, unsigned RequiredVectorWidth) - : X86GenSubtargetInfo(TT, CPU, FS), - PICStyle(PICStyles::None), TM(TM), TargetTriple(TT), - StackAlignOverride(StackAlignOverride), + : X86GenSubtargetInfo(TT, CPU, FS), PICStyle(PICStyles::None), TM(TM), + TargetTriple(TT), StackAlignOverride(StackAlignOverride), PreferVectorWidthOverride(PreferVectorWidthOverride), RequiredVectorWidth(RequiredVectorWidth), In64BitMode(TargetTriple.getArch() == Triple::x86_64), @@ -355,7 +359,7 @@ const CallLowering *X86Subtarget::getCallLowering() const { return CallLoweringInfo.get(); } -const InstructionSelector *X86Subtarget::getInstructionSelector() const { +InstructionSelector *X86Subtarget::getInstructionSelector() const { return InstSelector.get(); } diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 24ccc9cb7843..e8efe8f2afe5 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -365,8 +365,8 @@ protected: /// Processor has AVX-512 vp2intersect instructions bool HasVP2INTERSECT = false; - /// Processor supports MPX - Memory Protection Extensions - bool HasMPX = false; + /// Deprecated flag for MPX instructions. + bool DeprecatedHasMPX = false; /// Processor supports CET SHSTK - Control-Flow Enforcement Technology /// using Shadow Stack @@ -427,15 +427,21 @@ protected: /// Use software floating point for code generation. bool UseSoftFloat = false; + /// Use alias analysis during code generation. + bool UseAA = false; + /// The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. - unsigned stackAlignment = 4; + Align stackAlignment = Align(4); /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops. /// // FIXME: this is a known good value for Yonah. How about others? unsigned MaxInlineSizeThreshold = 128; + /// Indicates target prefers 128 bit instructions. + bool Prefer128Bit = false; + /// Indicates target prefers 256 bit instructions. bool Prefer256Bit = false; @@ -453,7 +459,7 @@ protected: private: /// Override the stack alignment. - unsigned StackAlignOverride; + MaybeAlign StackAlignOverride; /// Preferred vector width from function attribute. unsigned PreferVectorWidthOverride; @@ -490,7 +496,7 @@ public: /// of the specified triple. /// X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, - const X86TargetMachine &TM, unsigned StackAlignOverride, + const X86TargetMachine &TM, MaybeAlign StackAlignOverride, unsigned PreferVectorWidthOverride, unsigned RequiredVectorWidth); @@ -515,7 +521,7 @@ public: /// Returns the minimum alignment known to hold of the /// stack frame on entry to the function and which must be maintained by every /// function for this subtarget. - unsigned getStackAlignment() const { return stackAlignment; } + Align getStackAlignment() const { return stackAlignment; } /// Returns the maximum memset / memcpy size /// that still makes it profitable to inline the call. 
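// A minimal sketch (not from this patch) of the MaybeAlign semantics the
// new fields rely on: MaybeAlign is an Optional<Align>, which lets the
// subtarget distinguish "no stack-alignment override was given" from any
// explicit value.
MaybeAlign Override;                                // unset: no override
Align StackAlign = Override.getValueOr(Align(4));   // platform fallback
assert(StackAlign.value() == 4 && "default applies when unset");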
@@ -527,7 +533,7 @@ public: /// Methods used by Global ISel const CallLowering *getCallLowering() const override; - const InstructionSelector *getInstructionSelector() const override; + InstructionSelector *getInstructionSelector() const override; const LegalizerInfo *getLegalizerInfo() const override; const RegisterBankInfo *getRegBankInfo() const override; @@ -684,7 +690,6 @@ public: bool hasBF16() const { return HasBF16; } bool hasVP2INTERSECT() const { return HasVP2INTERSECT; } bool hasBITALG() const { return HasBITALG; } - bool hasMPX() const { return HasMPX; } bool hasSHSTK() const { return HasSHSTK; } bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; } bool hasCLWB() const { return HasCLWB; } @@ -739,6 +744,7 @@ public: X86ProcFamily == IntelTRM; } bool useSoftFloat() const { return UseSoftFloat; } + bool useAA() const override { return UseAA; } /// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for /// no-sse2). There isn't any reason to disable it if the target processor @@ -809,6 +815,7 @@ public: // On Win64, all these conventions just use the default convention. case CallingConv::C: case CallingConv::Fast: + case CallingConv::Tail: case CallingConv::Swift: case CallingConv::X86_FastCall: case CallingConv::X86_StdCall: diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 0cbf13899a29..c15297134e4d 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -81,27 +81,28 @@ extern "C" void LLVMInitializeX86Target() { initializeX86SpeculativeLoadHardeningPassPass(PR); initializeX86FlagsCopyLoweringPassPass(PR); initializeX86CondBrFoldingPassPass(PR); + initializeX86OptimizeLEAPassPass(PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { if (TT.isOSBinFormatMachO()) { if (TT.getArch() == Triple::x86_64) - return llvm::make_unique<X86_64MachoTargetObjectFile>(); - return llvm::make_unique<TargetLoweringObjectFileMachO>(); + return std::make_unique<X86_64MachoTargetObjectFile>(); + return std::make_unique<TargetLoweringObjectFileMachO>(); } if (TT.isOSFreeBSD()) - return llvm::make_unique<X86FreeBSDTargetObjectFile>(); + return std::make_unique<X86FreeBSDTargetObjectFile>(); if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU()) - return llvm::make_unique<X86LinuxNaClTargetObjectFile>(); + return std::make_unique<X86LinuxNaClTargetObjectFile>(); if (TT.isOSSolaris()) - return llvm::make_unique<X86SolarisTargetObjectFile>(); + return std::make_unique<X86SolarisTargetObjectFile>(); if (TT.isOSFuchsia()) - return llvm::make_unique<X86FuchsiaTargetObjectFile>(); + return std::make_unique<X86FuchsiaTargetObjectFile>(); if (TT.isOSBinFormatELF()) - return llvm::make_unique<X86ELFTargetObjectFile>(); + return std::make_unique<X86ELFTargetObjectFile>(); if (TT.isOSBinFormatCOFF()) - return llvm::make_unique<TargetLoweringObjectFileCOFF>(); + return std::make_unique<TargetLoweringObjectFileCOFF>(); llvm_unreachable("unknown subtarget type"); } @@ -116,6 +117,9 @@ static std::string computeDataLayout(const Triple &TT) { !TT.isArch64Bit()) Ret += "-p:32:32"; + // Address spaces for 32 bit signed, 32 bit unsigned, and 64 bit pointers. + Ret += "-p270:32:32-p271:32:32-p272:64:64"; + // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. 
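// Sketch (illustrative, abbreviated layout string): the address-space
// entries added to computeDataLayout above are visible through the
// DataLayout API, e.g.:
DataLayout DL("e-p270:32:32-p271:32:32-p272:64:64-i64:64");
assert(DL.getPointerSizeInBits(270) == 32);  // 32-bit sign-extended ptr
assert(DL.getPointerSizeInBits(271) == 32);  // 32-bit zero-extended ptr
assert(DL.getPointerSizeInBits(272) == 64);  // 64-bit pointer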
if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl()) Ret += "-i64:64"; @@ -218,17 +222,9 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT, getEffectiveX86CodeModel(CM, JIT, TT.getArch() == Triple::x86_64), OL), TLOF(createTLOF(getTargetTriple())) { - // Windows stack unwinder gets confused when execution flow "falls through" - // after a call to 'noreturn' function. - // To prevent that, we emit a trap for 'unreachable' IR instructions. - // (which on X86, happens to be the 'ud2' instruction) // On PS4, the "return address" of a 'noreturn' call must still be within // the calling function, and TrapUnreachable is an easy way to get that. - // The check here for 64-bit windows is a bit icky, but as we're unlikely - // to ever want to mix 32 and 64-bit windows code in a single module - // this should be fine. - if ((TT.isOSWindows() && TT.getArch() == Triple::x86_64) || TT.isPS4() || - TT.isOSBinFormatMachO()) { + if (TT.isPS4() || TT.isOSBinFormatMachO()) { this->Options.TrapUnreachable = true; this->Options.NoTrapAfterNoreturn = TT.isOSBinFormatMachO(); } @@ -311,10 +307,10 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. resetTargetOptions(F); - I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this, - Options.StackAlignmentOverride, - PreferVectorWidthOverride, - RequiredVectorWidth); + I = std::make_unique<X86Subtarget>( + TargetTriple, CPU, FS, *this, + MaybeAlign(Options.StackAlignmentOverride), PreferVectorWidthOverride, + RequiredVectorWidth); } return I.get(); } @@ -517,12 +513,19 @@ void X86PassConfig::addPreEmitPass() { } void X86PassConfig::addPreEmitPass2() { + const Triple &TT = TM->getTargetTriple(); + const MCAsmInfo *MAI = TM->getMCAsmInfo(); + addPass(createX86RetpolineThunksPass()); + + // Insert extra int3 instructions after trailing call instructions to avoid + // issues in the unwinder. + if (TT.isOSWindows() && TT.getArch() == Triple::x86_64) + addPass(createX86AvoidTrailingCallPass()); + // Verify basic block incoming and outgoing cfa offset and register values and // correct CFA calculation rule where needed by inserting appropriate CFI // instructions. 
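// A plausible illustration of what the int3 padding prevents (assembly is
// illustrative, not from the patch): for a tail-positioned noreturn call,
// the return address is the byte after the CALL, which without padding can
// lie outside the function's unwind info on Win64.
//
//   callq   abort          # noreturn; last instruction of the function
//   int3                   # inserted so the return address still maps here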
- const Triple &TT = TM->getTargetTriple(); - const MCAsmInfo *MAI = TM->getMCAsmInfo(); if (!TT.isOSDarwin() && (!TT.isOSWindows() || MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI)) diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index b999e2e86af6..ec3db7b1e9e8 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -16,7 +16,6 @@ #include "X86Subtarget.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/StringMap.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Support/CodeGen.h" #include "llvm/Target/TargetMachine.h" #include <memory> @@ -26,6 +25,7 @@ namespace llvm { class StringRef; class X86Subtarget; class X86RegisterBankInfo; +class TargetTransformInfo; class X86TargetMachine final : public LLVMTargetMachine { std::unique_ptr<TargetLoweringObjectFile> TLOF; diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index 92e0779c2e74..44185957686b 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -47,8 +47,8 @@ MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol( } const MCExpr *X86_64MachoTargetObjectFile::getIndirectSymViaGOTPCRel( - const MCSymbol *Sym, const MCValue &MV, int64_t Offset, - MachineModuleInfo *MMI, MCStreamer &Streamer) const { + const GlobalValue *GV, const MCSymbol *Sym, const MCValue &MV, + int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const { // On Darwin/X86-64, we need to use foo@GOTPCREL+4 to access the got entry // from a data section. In case there's an additional offset, then use // foo@GOTPCREL+4+<offset>. diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index 13d7b4ad70d6..1fd0bbf56b19 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -30,7 +30,8 @@ namespace llvm { const TargetMachine &TM, MachineModuleInfo *MMI) const override; - const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym, + const MCExpr *getIndirectSymViaGOTPCRel(const GlobalValue *GV, + const MCSymbol *Sym, const MCValue &MV, int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const override; diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 3dc59aeb263e..70fd857fcf01 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -116,7 +116,8 @@ llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity( llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); } -unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) { +unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { + bool Vector = (ClassID == 1); if (Vector && !ST->hasSSE1()) return 0; @@ -887,7 +888,7 @@ int X86TTIImpl::getArithmeticInstrCost( int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. - // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. + // 64-bit packed integer vectors (v2i32) are widened to type v4i32. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); // Treat Transpose as 2-op shuffles - there's no difference in lowering. 
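// Consequence of the widening note above (illustrative IR): a shuffle such
// as
//   shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 2>
// is now legalized by widening to v4i32 rather than promotion to v2i64,
// so its cost is looked up in the v4i32 shuffle tables.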
@@ -911,6 +912,39 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
       int NumSubElts = SubLT.second.getVectorNumElements();
       if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
         return SubLT.first;
+      // Handle some cases for widening legalization. For now we only handle
+      // cases where the original subvector was naturally aligned and evenly
+      // fit in its legalized subvector type.
+      // FIXME: Remove some of the alignment restrictions.
+      // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
+      // vectors.
+      int OrigSubElts = SubTp->getVectorNumElements();
+      if (NumSubElts > OrigSubElts &&
+          (Index % OrigSubElts) == 0 && (NumSubElts % OrigSubElts) == 0 &&
+          LT.second.getVectorElementType() ==
+              SubLT.second.getVectorElementType() &&
+          LT.second.getVectorElementType().getSizeInBits() ==
+              Tp->getVectorElementType()->getPrimitiveSizeInBits()) {
+        assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
+               "Unexpected number of elements!");
+        Type *VecTy = VectorType::get(Tp->getVectorElementType(),
+                                      LT.second.getVectorNumElements());
+        Type *SubTy = VectorType::get(Tp->getVectorElementType(),
+                                      SubLT.second.getVectorNumElements());
+        int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
+        int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
+                                         ExtractIndex, SubTy);
+
+        // If the original size is 32-bits or more, we can use pshufd. Otherwise
+        // if we have SSSE3 we can use pshufb.
+        if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
+          return ExtractCost + 1; // pshufd or pshufb
+
+        assert(SubTp->getPrimitiveSizeInBits() == 16 &&
+               "Unexpected vector size");
+
+        return ExtractCost + 2; // worst case pshufhw + pshufd
+      }
     }
   }
@@ -1314,8 +1348,10 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
-    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
+    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
+    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
     { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
+    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
     { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
     { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
 
@@ -1354,6 +1390,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
     { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
 
+    { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
+    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
     { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
     { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
@@ -1371,14 +1409,14 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
-    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
-    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
-    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
-    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
@@ -1402,13 +1440,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
-    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 },
     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
@@ -1421,7 +1459,10 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
     { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
     { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 },
+    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 11 },
+    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 9 },
     { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 },
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 11 },
 
     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
     { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
@@ -1507,6 +1548,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
     { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 },
     { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
+    { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB
 
     { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
   };
@@ -1520,7 +1562,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
-    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 2*10 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2*10 },
     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
 
@@ -1536,6 +1579,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 },
 
     { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 },
+    { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
+    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
 
     { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
     { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
@@ -1562,15 +1607,21 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 },
 
+    { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB
     { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 },
     { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 },
     { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
+    { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+3*PACKUSWB
+    { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
     { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
     { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
     { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
     { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
     { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
     { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
+    { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
+    { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
+    { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1 }, // PSHUFD
   };
 
   std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
@@ -1691,6 +1742,11 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
     }
   }
 
+  static const CostTblEntry SLMCostTbl[] = {
+    // slm pcmpeq/pcmpgt throughput is 2
+    { ISD::SETCC, MVT::v2i64, 2 },
+  };
+
   static const CostTblEntry AVX512BWCostTbl[] = {
     { ISD::SETCC, MVT::v32i16, 1 },
     { ISD::SETCC, MVT::v64i8, 1 },
@@ -1777,6 +1833,10 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
     { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
   };
 
+  if (ST->isSLM())
+    if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
+      return LT.first * (ExtraCost + Entry->Cost);
+
   if (ST->hasBWI())
     if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
       return LT.first * (ExtraCost + Entry->Cost);
@@ -2043,8 +2103,26 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
     { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
   };
+  static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
+    { ISD::CTLZ, MVT::i64, 1 },
+  };
+  static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
+    { ISD::CTLZ, MVT::i32, 1 },
+    { ISD::CTLZ, MVT::i16, 1 },
+    { ISD::CTLZ, MVT::i8, 1 },
+  };
+  static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
+    { ISD::CTPOP, MVT::i64, 1 },
+  };
+  static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
+    { ISD::CTPOP, MVT::i32, 1 },
+    { ISD::CTPOP, MVT::i16, 1 },
+    { ISD::CTPOP, MVT::i8, 1 },
+  };
   static const CostTblEntry X64CostTbl[] = { // 64-bit targets
     { ISD::BITREVERSE, MVT::i64, 14 },
+    { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
+    { ISD::CTPOP, MVT::i64, 10 },
     { ISD::SADDO, MVT::i64, 1 },
     { ISD::UADDO, MVT::i64, 1 },
   };
@@ -2052,6 +2130,12 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::BITREVERSE, MVT::i32, 14 },
     { ISD::BITREVERSE, MVT::i16, 14 },
     { ISD::BITREVERSE, MVT::i8, 11 },
+    { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
+    { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
+    { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
+    { ISD::CTPOP, MVT::i32, 8 },
+    { ISD::CTPOP, MVT::i16, 9 },
+    { ISD::CTPOP, MVT::i8, 7 },
     { ISD::SADDO, MVT::i32, 1 },
     { ISD::SADDO, MVT::i16, 1 },
     { ISD::SADDO, MVT::i8, 1 },
@@ -2163,6 +2247,26 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
       return LT.first * Entry->Cost;
 
+  if (ST->hasLZCNT()) {
+    if (ST->is64Bit())
+      if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
+
+    if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+  }
+
+  if (ST->hasPOPCNT()) {
+    if (ST->is64Bit())
+      if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
+
+    if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+  }
+
+  // TODO - add BMI (TZCNT) scalar handling
+
   if (ST->is64Bit())
     if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;
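Editor's note (illustration, not part of the import): the tables above price scalar `ctlz` at 1 with LZCNT and at 4 without it, where the fallback lowering is BSR+XOR (plus a CMOV to handle a zero input). The XOR works because for nonzero `x` the `bsr` result lies in [0, 31], so `31 - bsr(x) == 31 ^ bsr(x)`. A standalone sketch of that identity; `bsr32` is a portable stand-in for the instruction:

```cpp
// Why CTLZ without LZCNT is costed as BSR+XOR(+CMOV): clz(x) == 31 ^ bsr(x)
// for nonzero x, since bsr(x) is the index of the highest set bit.
#include <cassert>
#include <cstdint>

unsigned bsr32(uint32_t x) { // portable stand-in for the x86 'bsr' instruction
  unsigned Idx = 0;
  while (x >>= 1)
    ++Idx;
  return Idx;
}

unsigned ctlz32(uint32_t x) {
  if (x == 0)
    return 32;           // the CMOV in the real lowering covers this case
  return 31u ^ bsr32(x); // XOR is the cheap form of 31 - bsr(x)
}

int main() {
  assert(ctlz32(1) == 31);
  assert(ctlz32(0x80000000u) == 0);
  assert(ctlz32(0x00010000u) == 15);
}
```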
@@ -2357,8 +2461,9 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
   unsigned NumElem = SrcVTy->getVectorNumElements();
   VectorType *MaskTy =
       VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
-  if ((IsLoad && !isLegalMaskedLoad(SrcVTy)) ||
-      (IsStore && !isLegalMaskedStore(SrcVTy)) || !isPowerOf2_32(NumElem)) {
+  if ((IsLoad && !isLegalMaskedLoad(SrcVTy, MaybeAlign(Alignment))) ||
+      (IsStore && !isLegalMaskedStore(SrcVTy, MaybeAlign(Alignment))) ||
+      !isPowerOf2_32(NumElem)) {
     // Scalarization
     int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
     int ScalarCompareCost = getCmpSelInstrCost(
@@ -2425,70 +2530,107 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
 
 int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
                                            bool IsPairwise) {
-
-  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
-
-  MVT MTy = LT.second;
-
-  int ISD = TLI->InstructionOpcodeToISD(Opcode);
-  assert(ISD && "Invalid opcode");
-
   // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
   // and make it as the cost.
-  static const CostTblEntry SSE42CostTblPairWise[] = {
+  static const CostTblEntry SSE2CostTblPairWise[] = {
     { ISD::FADD, MVT::v2f64, 2 },
     { ISD::FADD, MVT::v4f32, 4 },
     { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
+    { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32.
     { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
+    { ISD::ADD, MVT::v2i16, 3 }, // FIXME: chosen to be less than v4i16
+    { ISD::ADD, MVT::v4i16, 4 }, // FIXME: chosen to be less than v8i16
     { ISD::ADD, MVT::v8i16, 5 },
+    { ISD::ADD, MVT::v2i8, 2 },
+    { ISD::ADD, MVT::v4i8, 2 },
+    { ISD::ADD, MVT::v8i8, 2 },
+    { ISD::ADD, MVT::v16i8, 3 },
   };
 
   static const CostTblEntry AVX1CostTblPairWise[] = {
-    { ISD::FADD, MVT::v4f32, 4 },
     { ISD::FADD, MVT::v4f64, 5 },
     { ISD::FADD, MVT::v8f32, 7 },
     { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
-    { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
     { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
-    { ISD::ADD, MVT::v8i16, 5 },
     { ISD::ADD, MVT::v8i32, 5 },
+    { ISD::ADD, MVT::v16i16, 6 },
+    { ISD::ADD, MVT::v32i8, 4 },
   };
 
-  static const CostTblEntry SSE42CostTblNoPairWise[] = {
+  static const CostTblEntry SSE2CostTblNoPairWise[] = {
     { ISD::FADD, MVT::v2f64, 2 },
     { ISD::FADD, MVT::v4f32, 4 },
     { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
+    { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
    { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
+    { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
+    { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
     { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
+    { ISD::ADD, MVT::v2i8, 2 },
+    { ISD::ADD, MVT::v4i8, 2 },
+    { ISD::ADD, MVT::v8i8, 2 },
+    { ISD::ADD, MVT::v16i8, 3 },
   };
 
   static const CostTblEntry AVX1CostTblNoPairWise[] = {
-    { ISD::FADD, MVT::v4f32, 3 },
     { ISD::FADD, MVT::v4f64, 3 },
+    { ISD::FADD, MVT::v4f32, 3 },
     { ISD::FADD, MVT::v8f32, 4 },
     { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
-    { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
     { ISD::ADD, MVT::v4i64, 3 },
-    { ISD::ADD, MVT::v8i16, 4 },
     { ISD::ADD, MVT::v8i32, 5 },
+    { ISD::ADD, MVT::v16i16, 5 },
+    { ISD::ADD, MVT::v32i8, 4 },
   };
 
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  // Before legalizing the type, give a chance to look up illegal narrow types
+  // in the table.
+  // FIXME: Is there a better way to do this?
+  EVT VT = TLI->getValueType(DL, ValTy);
+  if (VT.isSimple()) {
+    MVT MTy = VT.getSimpleVT();
+    if (IsPairwise) {
+      if (ST->hasAVX())
+        if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
+          return Entry->Cost;
+
+      if (ST->hasSSE2())
+        if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
+          return Entry->Cost;
+    } else {
+      if (ST->hasAVX())
+        if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+          return Entry->Cost;
+
+      if (ST->hasSSE2())
+        if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+          return Entry->Cost;
+    }
+  }
+
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+  MVT MTy = LT.second;
+
   if (IsPairwise) {
     if (ST->hasAVX())
       if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
         return LT.first * Entry->Cost;
 
-    if (ST->hasSSE42())
-      if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
+    if (ST->hasSSE2())
+      if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
        return LT.first * Entry->Cost;
   } else {
     if (ST->hasAVX())
       if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
         return LT.first * Entry->Cost;
 
-    if (ST->hasSSE42())
-      if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
+    if (ST->hasSSE2())
+      if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
       return LT.first * Entry->Cost;
 }
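Editor's note (illustration, not part of the import): the rewrite above consults the reduction-cost tables with the exact IR type before legalizing, so a narrow illegal type such as v2i32 is priced by its own (cheaper) entry instead of at the widened v4i32 rate. A toy model of that lookup order; types, costs, and the single widening rule are illustrative, not LLVM API:

```cpp
// Model of "exact type first, legalized type second", as in the hunk above.
#include <cstdio>
#include <map>
#include <string>

int reductionCost(const std::string &Ty) {
  // Mirrors the new SSE2 tables, which now carry narrow entries like v2i32.
  static const std::map<std::string, int> Table = {
      {"v2i32", 2}, {"v4i32", 3}, {"v8i16", 4}};

  // Step 1: exact (possibly illegal) type.
  if (auto It = Table.find(Ty); It != Table.end())
    return It->second;

  // Step 2: fall back to the legalized type; one hard-coded widening rule
  // stands in for getTypeLegalizationCost here.
  std::string Widened = (Ty == "v2i16") ? "v8i16" : "v4i32";
  return Table.at(Widened);
}

int main() {
  // With the narrow entry present, v2i32 costs 2 rather than the widened 3.
  std::printf("v2i32: %d, v2i8: %d\n", reductionCost("v2i32"),
              reductionCost("v2i8"));
}
```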
@@ -3116,7 +3258,7 @@ bool X86TTIImpl::canMacroFuseCmp() {
   return ST->hasMacroFusion() || ST->hasBranchFusion();
 }
 
-bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
+bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
   if (!ST->hasAVX())
     return false;
 
@@ -3139,11 +3281,11 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
          ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
 }
 
-bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
-  return isLegalMaskedLoad(DataType);
+bool X86TTIImpl::isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) {
+  return isLegalMaskedLoad(DataType, Alignment);
 }
 
-bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) {
+bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
   unsigned DataSize = DL.getTypeStoreSize(DataType);
   // The only supported nontemporal loads are for aligned vectors of 16 or 32
   // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
@@ -3154,7 +3296,7 @@ bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) {
   return false;
 }
 
-bool X86TTIImpl::isLegalNTStore(Type *DataType, unsigned Alignment) {
+bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
   unsigned DataSize = DL.getTypeStoreSize(DataType);
 
   // SSE4A supports nontemporal stores of float and double at arbitrary
@@ -3299,9 +3441,8 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
   if (IsZeroCmp) {
     // Only enable vector loads for equality comparison. Right now the vector
     // version is not as fast for three way compare (see #33329).
-    // TODO: enable AVX512 when the DAG is ready.
-    // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
     const unsigned PreferredWidth = ST->getPreferVectorWidth();
+    if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
     if (PreferredWidth >= 256 && ST->hasAVX2()) Options.LoadSizes.push_back(32);
     if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
     // All GPR and vector loads can be unaligned. SIMD compare requires integer
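Editor's note (illustration, not part of the import): the memcmp hunk above enables 64/32/16-byte vector loads for equality-only comparisons, gated on the preferred vector width. A hedged sketch of the kind of block a 32-byte chunk expands to on AVX2, using real intrinsics (compile with `-mavx2`):

```cpp
// What one 32-byte memcmp-for-equality block looks like with AVX2: a pair of
// unaligned loads, a byte-wise compare, and a movemask test.
#include <immintrin.h>

bool equal32(const void *A, const void *B) {
  __m256i VA = _mm256_loadu_si256(static_cast<const __m256i *>(A));
  __m256i VB = _mm256_loadu_si256(static_cast<const __m256i *>(B));
  __m256i Eq = _mm256_cmpeq_epi8(VA, VB); // 0xFF in every lane that matches
  return _mm256_movemask_epi8(Eq) == -1;  // all 32 lanes equal
}
```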
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index 25d9c33eb16d..7581257f41f8 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -83,6 +83,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
       X86::FeatureSlowUAMem32,
 
       // Based on whether user set the -mprefer-vector-width command line.
+      X86::FeaturePrefer128Bit,
       X86::FeaturePrefer256Bit,
 
       // CPU name enums. These just follow CPU string.
@@ -115,7 +116,7 @@ public:
   /// \name Vector TTI Implementations
   /// @{
 
-  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getNumberOfRegisters(unsigned ClassID) const;
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
   unsigned getMaxInterleaveFactor(unsigned VF);
@@ -184,10 +185,10 @@ public:
   bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                      TargetTransformInfo::LSRCost &C2);
   bool canMacroFuseCmp();
-  bool isLegalMaskedLoad(Type *DataType);
-  bool isLegalMaskedStore(Type *DataType);
-  bool isLegalNTLoad(Type *DataType, unsigned Alignment);
-  bool isLegalNTStore(Type *DataType, unsigned Alignment);
+  bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment);
+  bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment);
+  bool isLegalNTLoad(Type *DataType, Align Alignment);
+  bool isLegalNTStore(Type *DataType, Align Alignment);
   bool isLegalMaskedGather(Type *DataType);
   bool isLegalMaskedScatter(Type *DataType);
   bool isLegalMaskedExpandLoad(Type *DataType);
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index a07d2f20acab..9280d030b5d5 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -292,8 +292,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
   // need to insert any VZEROUPPER instructions. This is constant-time, so it
   // is cheap in the common case of no ymm/zmm use.
   bool YmmOrZmmUsed = FnHasLiveInYmmOrZmm;
-  const TargetRegisterClass *RCs[2] = {&X86::VR256RegClass, &X86::VR512RegClass};
-  for (auto *RC : RCs) {
+  for (auto *RC : {&X86::VR256RegClass, &X86::VR512_0_15RegClass}) {
     if (!YmmOrZmmUsed) {
       for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
            i++) {
@@ -304,9 +303,8 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
       }
     }
   }
-  if (!YmmOrZmmUsed) {
+  if (!YmmOrZmmUsed)
     return false;
-  }
 
   assert(BlockStates.empty() && DirtySuccessors.empty() &&
          "X86VZeroUpper state should be clear");
diff --git a/lib/Target/X86/X86WinAllocaExpander.cpp b/lib/Target/X86/X86WinAllocaExpander.cpp
index 9e499db1d7ee..ae72c6427588 100644
--- a/lib/Target/X86/X86WinAllocaExpander.cpp
+++ b/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -81,7 +81,7 @@ static int64_t getWinAllocaAmount(MachineInstr *MI, MachineRegisterInfo *MRI) {
          MI->getOpcode() == X86::WIN_ALLOCA_64);
   assert(MI->getOperand(0).isReg());
 
-  unsigned AmountReg = MI->getOperand(0).getReg();
+  Register AmountReg = MI->getOperand(0).getReg();
   MachineInstr *Def = MRI->getUniqueVRegDef(AmountReg);
   if (!Def ||
@@ -261,7 +261,7 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
     break;
   }
 
-  unsigned AmountReg = MI->getOperand(0).getReg();
+  Register AmountReg = MI->getOperand(0).getReg();
   MI->eraseFromParent();
 
   // Delete the definition of AmountReg.
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index f68d17d7256d..d65e1f3ab414 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -339,7 +339,10 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
     if (UseStackGuard) {
       Value *Val = Builder.CreateLoad(Int32Ty, Cookie);
       Value *FrameAddr = Builder.CreateCall(
-          Intrinsic::getDeclaration(TheModule, Intrinsic::frameaddress),
+          Intrinsic::getDeclaration(
+              TheModule, Intrinsic::frameaddress,
+              Builder.getInt8PtrTy(
+                  TheModule->getDataLayout().getAllocaAddrSpace())),
           Builder.getInt32(0), "frameaddr");
       Value *FrameAddrI32 = Builder.CreatePtrToInt(FrameAddr, Int32Ty);
       FrameAddrI32 = Builder.CreateXor(FrameAddrI32, Val);
diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
index 9f615b9e7741..6b3dc27cb886 100644
--- a/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -115,7 +115,7 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
 
   MCSymbol *GVSym = getSymbol(GV);
   const Constant *C = GV->getInitializer();
-  unsigned Align = (unsigned)DL.getPreferredTypeAlignmentShift(C->getType());
+  const Align Alignment(DL.getPrefTypeAlignment(C->getType()));
 
   // Mark the start of the global
   getTargetStreamer().emitCCTopData(GVSym->getName());
@@ -143,7 +143,7 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
     llvm_unreachable("Unknown linkage type!");
   }
 
-  EmitAlignment(Align > 2 ? Align : 2, GV);
+  EmitAlignment(std::max(Alignment, Align(4)), GV);
 
   if (GV->isThreadLocal()) {
     report_fatal_error("TLS is not supported by this target!");
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index 5066407c74aa..fd8b37e26e47 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -211,7 +211,7 @@ static void RestoreSpillList(MachineBasicBlock &MBB,
 //===----------------------------------------------------------------------===//
 
 XCoreFrameLowering::XCoreFrameLowering(const XCoreSubtarget &sti)
-    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 4, 0) {
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(4), 0) {
   // Do nothing
 }
 
@@ -367,8 +367,8 @@ void XCoreFrameLowering::emitEpilogue(MachineFunction &MF,
     RestoreSpillList(MBB, MBBI, dl, TII, RemainingAdj, SpillList);
 
     // Return to the landing pad.
-    unsigned EhStackReg = MBBI->getOperand(0).getReg();
-    unsigned EhHandlerReg = MBBI->getOperand(1).getReg();
+    Register EhStackReg = MBBI->getOperand(0).getReg();
+    Register EhHandlerReg = MBBI->getOperand(1).getReg();
     BuildMI(MBB, MBBI, dl, TII.get(XCore::SETSP_1r)).addReg(EhStackReg);
     BuildMI(MBB, MBBI, dl, TII.get(XCore::BAU_1r)).addReg(EhHandlerReg);
     MBB.erase(MBBI); // Erase the previous return instruction.
diff --git a/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
index e433d21c59b7..b5dbdea98eea 100644
--- a/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
+++ b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
@@ -55,7 +55,7 @@ bool XCoreFTAOElim::runOnMachineFunction(MachineFunction &MF) {
          MBBI != EE; ++MBBI) {
       if (MBBI->getOpcode() == XCore::FRAME_TO_ARGS_OFFSET) {
         MachineInstr &OldInst = *MBBI;
-        unsigned Reg = OldInst.getOperand(0).getReg();
+        Register Reg = OldInst.getOperand(0).getReg();
         MBBI = TII.loadImmediate(MBB, MBBI, Reg, StackSize);
         OldInst.eraseFromParent();
       }
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 072278d9fc46..bf006fd673f1 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -171,8 +171,8 @@ XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::INTRINSIC_VOID);
   setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
 
-  setMinFunctionAlignment(1);
-  setPrefFunctionAlignment(2);
+  setMinFunctionAlignment(Align(2));
+  setPrefFunctionAlignment(Align(4));
 }
 
 bool XCoreTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
@@ -414,8 +414,8 @@ SDValue XCoreTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
          "Unexpected extension type");
   assert(LD->getMemoryVT() == MVT::i32 && "Unexpected load EVT");
 
-  if (allowsMemoryAccess(Context, DAG.getDataLayout(), LD->getMemoryVT(),
-                         *LD->getMemOperand()))
+  if (allowsMemoryAccessForAlignment(Context, DAG.getDataLayout(),
+                                     LD->getMemoryVT(), *LD->getMemOperand()))
     return SDValue();
 
   SDValue Chain = LD->getChain();
@@ -488,8 +488,8 @@ SDValue XCoreTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   assert(!ST->isTruncatingStore() && "Unexpected store type");
   assert(ST->getMemoryVT() == MVT::i32 && "Unexpected store EVT");
 
-  if (allowsMemoryAccess(Context, DAG.getDataLayout(), ST->getMemoryVT(),
-                         *ST->getMemOperand()))
+  if (allowsMemoryAccessForAlignment(Context, DAG.getDataLayout(),
+                                     ST->getMemoryVT(), *ST->getMemOperand()))
     return SDValue();
 
   SDValue Chain = ST->getChain();
@@ -1309,7 +1309,7 @@ SDValue XCoreTargetLowering::LowerCCCArguments(
       llvm_unreachable(nullptr);
     }
     case MVT::i32:
-      unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
+      Register VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
       RegInfo.addLiveIn(VA.getLocReg(), VReg);
       ArgIn = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
       CFRegNode.push_back(ArgIn.getValue(ArgIn->getNumValues() - 1));
@@ -1360,7 +1360,7 @@ SDValue XCoreTargetLowering::LowerCCCArguments(
       offset -= StackSlotSize;
       SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
       // Move argument from phys reg -> virt reg
-      unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
+      Register VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
       RegInfo.addLiveIn(ArgRegs[i], VReg);
       SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
       CFRegNode.push_back(Val.getValue(Val->getNumValues() - 1));
@@ -1780,8 +1780,9 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
     // Replace unaligned store of unaligned load with memmove.
     StoreSDNode *ST = cast<StoreSDNode>(N);
     if (!DCI.isBeforeLegalize() ||
-        allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
-                           ST->getMemoryVT(), *ST->getMemOperand()) ||
+        allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+                                       ST->getMemoryVT(),
+                                       *ST->getMemOperand()) ||
         ST->isVolatile() || ST->isIndexed()) {
       break;
     }
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index 3752274e2cdf..86ec7f82d4d1 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -301,7 +301,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
              << "<--------->\n");
 
   Offset/=4;
-  unsigned Reg = MI.getOperand(0).getReg();
+  Register Reg = MI.getOperand(0).getReg();
   assert(XCore::GRRegsRegClass.contains(Reg) && "Unexpected register operand");
 
   if (TFI->hasFP(MF)) {
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 2a8cd6b657b7..b5b7445265b7 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -53,7 +53,7 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, const Triple &TT,
           T, "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32",
           TT, CPU, FS, Options, getEffectiveRelocModel(RM),
           getEffectiveXCoreCodeModel(CM), OL),
-      TLOF(llvm::make_unique<XCoreTargetObjectFile>()),
+      TLOF(std::make_unique<XCoreTargetObjectFile>()),
       Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
 }
diff --git a/lib/Target/XCore/XCoreTargetTransformInfo.h b/lib/Target/XCore/XCoreTargetTransformInfo.h
index 3fecaaa59722..58df1f290ec9 100644
--- a/lib/Target/XCore/XCoreTargetTransformInfo.h
+++ b/lib/Target/XCore/XCoreTargetTransformInfo.h
@@ -40,7 +40,8 @@ public:
       : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()),
        TLI(ST->getTargetLowering()) {}
 
-  unsigned getNumberOfRegisters(bool Vector) {
+  unsigned getNumberOfRegisters(unsigned ClassID) const {
+    bool Vector = (ClassID == 1);
     if (Vector) {
       return 0;
     }
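Editor's note (illustration, not part of the import): a running theme in the X86 and XCore hunks is the migration from raw `unsigned` alignments to the typed `llvm::Align`/`MaybeAlign`. The XCore AsmPrinter hunk also changes units: the old `EmitAlignment(Align > 2 ? Align : 2, GV)` passed a log2 shift amount (2 meaning 2^2 = 4 bytes), while the new `EmitAlignment(std::max(Alignment, Align(4)), GV)` passes a byte alignment. A standalone sketch of the two conventions; `ByteAlign` is a minimal stand-in, not the real `llvm::Align`:

```cpp
// The unit change behind the XCore EmitAlignment hunk: old API used a log2
// "shift", the new Align-style API carries the power-of-two byte value.
#include <cassert>
#include <cstdint>

struct ByteAlign {              // minimal stand-in for llvm::Align
  uint64_t Value;               // always a power of two, in bytes
  explicit ByteAlign(uint64_t V) : Value(V) { assert((V & (V - 1)) == 0); }
};

uint64_t fromLog2(unsigned Shift) { return uint64_t(1) << Shift; }

int main() {
  unsigned OldShift = 2;        // old code: minimum shift of 2
  ByteAlign New(4);             // new code: minimum Align(4), in bytes
  assert(fromLog2(OldShift) == New.Value); // same 4-byte alignment, new units
}
```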

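Editor's note (illustration, not part of the import): the X86WinEHState hunk reflects `llvm.frameaddress` becoming overloaded on its pointer result type, so `Intrinsic::getDeclaration` now needs the address-space-qualified `i8*` spelled out. A hedged sketch of the call shape, mirroring that hunk; it assumes an existing `Module *M` and `IRBuilder<> Builder` set up by the caller, against the LLVM headers of this import:

```cpp
// Looking up the now-overloaded llvm.frameaddress declaration, as in the
// X86WinEHState hunk: the explicit i8* (in the alloca address space) selects
// the overload.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

llvm::Value *emitFrameAddress(llvm::Module *M, llvm::IRBuilder<> &Builder) {
  llvm::Function *FrameAddrFn = llvm::Intrinsic::getDeclaration(
      M, llvm::Intrinsic::frameaddress,
      Builder.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace()));
  // frameaddress(0) yields the current function's frame address.
  return Builder.CreateCall(FrameAddrFn, Builder.getInt32(0), "frameaddr");
}
```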