author     Dimitry Andric <dim@FreeBSD.org>    2017-06-26 20:32:52 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2017-06-26 20:32:52 +0000
commit     08bbd35a80bf7765fe0d3043f9eb5a2f2786b649 (patch)
tree       80108f0f128657f8623f8f66ad9735b4d88e7b47 /lib/Target
parent     7c7aba6e5fef47a01a136be655b0a92cfd7090f6 (diff)
Diffstat (limited to 'lib/Target')
157 files changed, 4121 insertions, 1766 deletions
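
Two changes dominate this drop: a new AArch64CondBrTuning machine pass, and selection of the ARMv8.1 LSE atomic instructions when the subtarget has them. As a rough sketch of what the branch-tuning pass buys (adapted from the examples in the pass's own header comment below; the function names and assembly are illustrative, not output captured from this commit):

    // Hypothetical source where a compare-against-zero feeds a branch and the
    // sum is dead after the test:
    int g();
    int f(int a, int b) {
      if (a + b == 0)
        return g();
      return 0;
    }

    // Before the pass: the add leaves NZCV untouched, so the backend emits a
    // compare-and-branch on the add's result:
    //   add w8, w0, w1
    //   cbz w8, .LBB0_2
    //
    // After the pass: since w8 has no other uses, the def is rewritten to its
    // flag-setting form (CMN is an alias of ADDS to the zero register) and the
    // CBZ becomes a plain conditional branch, which schedules more flexibly:
    //   cmn  w0, w1
    //   b.eq .LBB0_2

As the code below checks, the rewrite only fires when no instruction between the def and the branch reads or clobbers NZCV.
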
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h index 3e0e3978b90b5..37b9690d0434a 100644 --- a/lib/Target/AArch64/AArch64.h +++ b/lib/Target/AArch64/AArch64.h @@ -31,6 +31,7 @@ class MachineFunctionPass; FunctionPass *createAArch64DeadRegisterDefinitions(); FunctionPass *createAArch64RedundantCopyEliminationPass(); +FunctionPass *createAArch64CondBrTuning(); FunctionPass *createAArch64ConditionalCompares(); FunctionPass *createAArch64AdvSIMDScalar(); FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM, @@ -55,6 +56,7 @@ void initializeAArch64A53Fix835769Pass(PassRegistry&); void initializeAArch64A57FPLoadBalancingPass(PassRegistry&); void initializeAArch64AdvSIMDScalarPass(PassRegistry&); void initializeAArch64CollectLOHPass(PassRegistry&); +void initializeAArch64CondBrTuningPass(PassRegistry &); void initializeAArch64ConditionalComparesPass(PassRegistry&); void initializeAArch64ConditionOptimizerPass(PassRegistry&); void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&); diff --git a/lib/Target/AArch64/AArch64CondBrTuning.cpp b/lib/Target/AArch64/AArch64CondBrTuning.cpp new file mode 100644 index 0000000000000..f27bc97ec3f3e --- /dev/null +++ b/lib/Target/AArch64/AArch64CondBrTuning.cpp @@ -0,0 +1,336 @@ +//===-- AArch64CondBrTuning.cpp --- Conditional branch tuning for AArch64 -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file contains a pass that transforms CBZ/CBNZ/TBZ/TBNZ instructions +/// into a conditional branch (B.cond), when the NZCV flags can be set for +/// "free". This is preferred on targets that have more flexibility when +/// scheduling B.cond instructions as compared to CBZ/CBNZ/TBZ/TBNZ (assuming +/// all other variables are equal). This can also reduce register pressure. +/// +/// A few examples: +/// +/// 1) add w8, w0, w1 -> cmn w0, w1 ; CMN is an alias of ADDS. +/// cbz w8, .LBB_2 -> b.eq .LBB0_2 +/// +/// 2) add w8, w0, w1 -> adds w8, w0, w1 ; w8 has multiple uses. +/// cbz w8, .LBB1_2 -> b.eq .LBB1_2 +/// +/// 3) sub w8, w0, w1 -> subs w8, w0, w1 ; w8 has multiple uses. 
+/// tbz w8, #31, .LBB6_2 -> b.ge .LBB6_2 +/// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64Subtarget.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-cond-br-tuning" +#define AARCH64_CONDBR_TUNING_NAME "AArch64 Conditional Branch Tuning" + +namespace { +class AArch64CondBrTuning : public MachineFunctionPass { + const AArch64InstrInfo *TII; + const TargetRegisterInfo *TRI; + + MachineRegisterInfo *MRI; + +public: + static char ID; + AArch64CondBrTuning() : MachineFunctionPass(ID) { + initializeAArch64CondBrTuningPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; + StringRef getPassName() const override { return AARCH64_CONDBR_TUNING_NAME; } + +private: + MachineInstr *getOperandDef(const MachineOperand &MO); + MachineInstr *convertToFlagSetting(MachineInstr &MI, bool IsFlagSetting); + MachineInstr *convertToCondBr(MachineInstr &MI); + bool tryToTuneBranch(MachineInstr &MI, MachineInstr &DefMI); +}; +} // end anonymous namespace + +char AArch64CondBrTuning::ID = 0; + +INITIALIZE_PASS(AArch64CondBrTuning, "aarch64-cond-br-tuning", + AARCH64_CONDBR_TUNING_NAME, false, false) + +void AArch64CondBrTuning::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +MachineInstr *AArch64CondBrTuning::getOperandDef(const MachineOperand &MO) { + if (!TargetRegisterInfo::isVirtualRegister(MO.getReg())) + return nullptr; + return MRI->getUniqueVRegDef(MO.getReg()); +} + +MachineInstr *AArch64CondBrTuning::convertToFlagSetting(MachineInstr &MI, + bool IsFlagSetting) { + // If this is already the flag setting version of the instruction (e.g., SUBS) + // just make sure the implicit-def of NZCV isn't marked dead. + if (IsFlagSetting) { + for (unsigned I = MI.getNumExplicitOperands(), E = MI.getNumOperands(); + I != E; ++I) { + MachineOperand &MO = MI.getOperand(I); + if (MO.isReg() && MO.isDead() && MO.getReg() == AArch64::NZCV) + MO.setIsDead(false); + } + return &MI; + } + bool Is64Bit; + unsigned NewOpc = TII->convertToFlagSettingOpc(MI.getOpcode(), Is64Bit); + unsigned NewDestReg = MI.getOperand(0).getReg(); + if (MRI->hasOneNonDBGUse(MI.getOperand(0).getReg())) + NewDestReg = Is64Bit ? 
AArch64::XZR : AArch64::WZR; + + MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(NewOpc), NewDestReg); + for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) + MIB.add(MI.getOperand(I)); + + return MIB; +} + +MachineInstr *AArch64CondBrTuning::convertToCondBr(MachineInstr &MI) { + AArch64CC::CondCode CC; + MachineBasicBlock *TargetMBB = TII->getBranchDestBlock(MI); + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected opcode!"); + + case AArch64::CBZW: + case AArch64::CBZX: + CC = AArch64CC::EQ; + break; + case AArch64::CBNZW: + case AArch64::CBNZX: + CC = AArch64CC::NE; + break; + case AArch64::TBZW: + case AArch64::TBZX: + CC = AArch64CC::GE; + break; + case AArch64::TBNZW: + case AArch64::TBNZX: + CC = AArch64CC::LT; + break; + } + return BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::Bcc)) + .addImm(CC) + .addMBB(TargetMBB); +} + +bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI, + MachineInstr &DefMI) { + // We don't want NZCV bits live across blocks. + if (MI.getParent() != DefMI.getParent()) + return false; + + bool IsFlagSetting = true; + unsigned MIOpc = MI.getOpcode(); + MachineInstr *NewCmp = nullptr, *NewBr = nullptr; + switch (DefMI.getOpcode()) { + default: + return false; + case AArch64::ADDWri: + case AArch64::ADDWrr: + case AArch64::ADDWrs: + case AArch64::ADDWrx: + case AArch64::ANDWri: + case AArch64::ANDWrr: + case AArch64::ANDWrs: + case AArch64::BICWrr: + case AArch64::BICWrs: + case AArch64::SUBWri: + case AArch64::SUBWrr: + case AArch64::SUBWrs: + case AArch64::SUBWrx: + IsFlagSetting = false; + case AArch64::ADDSWri: + case AArch64::ADDSWrr: + case AArch64::ADDSWrs: + case AArch64::ADDSWrx: + case AArch64::ANDSWri: + case AArch64::ANDSWrr: + case AArch64::ANDSWrs: + case AArch64::BICSWrr: + case AArch64::BICSWrs: + case AArch64::SUBSWri: + case AArch64::SUBSWrr: + case AArch64::SUBSWrs: + case AArch64::SUBSWrx: + switch (MIOpc) { + default: + llvm_unreachable("Unexpected opcode!"); + + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::TBZW: + case AArch64::TBNZW: + // Check to see if the TBZ/TBNZ is checking the sign bit. + if ((MIOpc == AArch64::TBZW || MIOpc == AArch64::TBNZW) && + MI.getOperand(1).getImm() != 31) + return false; + + // There must not be any instruction between DefMI and MI that clobbers or + // reads NZCV. 
+      MachineBasicBlock::iterator I(DefMI), E(MI);
+      for (I = std::next(I); I != E; ++I) {
+        if (I->modifiesRegister(AArch64::NZCV, TRI) ||
+            I->readsRegister(AArch64::NZCV, TRI))
+          return false;
+      }
+      DEBUG(dbgs() << " Replacing instructions:\n ");
+      DEBUG(DefMI.print(dbgs()));
+      DEBUG(dbgs() << " ");
+      DEBUG(MI.print(dbgs()));
+
+      NewCmp = convertToFlagSetting(DefMI, IsFlagSetting);
+      NewBr = convertToCondBr(MI);
+      break;
+    }
+    break;
+
+  case AArch64::ADDXri:
+  case AArch64::ADDXrr:
+  case AArch64::ADDXrs:
+  case AArch64::ADDXrx:
+  case AArch64::ANDXri:
+  case AArch64::ANDXrr:
+  case AArch64::ANDXrs:
+  case AArch64::BICXrr:
+  case AArch64::BICXrs:
+  case AArch64::SUBXri:
+  case AArch64::SUBXrr:
+  case AArch64::SUBXrs:
+  case AArch64::SUBXrx:
+    IsFlagSetting = false;
+  case AArch64::ADDSXri:
+  case AArch64::ADDSXrr:
+  case AArch64::ADDSXrs:
+  case AArch64::ADDSXrx:
+  case AArch64::ANDSXri:
+  case AArch64::ANDSXrr:
+  case AArch64::ANDSXrs:
+  case AArch64::BICSXrr:
+  case AArch64::BICSXrs:
+  case AArch64::SUBSXri:
+  case AArch64::SUBSXrr:
+  case AArch64::SUBSXrs:
+  case AArch64::SUBSXrx:
+    switch (MIOpc) {
+    default:
+      llvm_unreachable("Unexpected opcode!");
+
+    case AArch64::CBZX:
+    case AArch64::CBNZX:
+    case AArch64::TBZX:
+    case AArch64::TBNZX: {
+      // Check to see if the TBZ/TBNZ is checking the sign bit.
+      if ((MIOpc == AArch64::TBZX || MIOpc == AArch64::TBNZX) &&
+          MI.getOperand(1).getImm() != 63)
+        return false;
+      // There must not be any instruction between DefMI and MI that clobbers or
+      // reads NZCV.
+      MachineBasicBlock::iterator I(DefMI), E(MI);
+      for (I = std::next(I); I != E; ++I) {
+        if (I->modifiesRegister(AArch64::NZCV, TRI) ||
+            I->readsRegister(AArch64::NZCV, TRI))
+          return false;
+      }
+      DEBUG(dbgs() << " Replacing instructions:\n ");
+      DEBUG(DefMI.print(dbgs()));
+      DEBUG(dbgs() << " ");
+      DEBUG(MI.print(dbgs()));
+
+      NewCmp = convertToFlagSetting(DefMI, IsFlagSetting);
+      NewBr = convertToCondBr(MI);
+      break;
+    }
+    }
+    break;
+  }
+  assert(NewCmp && NewBr && "Expected new instructions.");
+
+  DEBUG(dbgs() << " with instruction:\n ");
+  DEBUG(NewCmp->print(dbgs()));
+  DEBUG(dbgs() << " ");
+  DEBUG(NewBr->print(dbgs()));
+
+  // If this was a flag setting version of the instruction, we use the original
+  // instruction by just clearing the dead marked on the implicit-def of NZCV.
+  // Therefore, we should not erase this instruction.
+ if (!IsFlagSetting) + DefMI.eraseFromParent(); + MI.eraseFromParent(); + return true; +} + +bool AArch64CondBrTuning::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + + DEBUG(dbgs() << "********** AArch64 Conditional Branch Tuning **********\n" + << "********** Function: " << MF.getName() << '\n'); + + TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); + TRI = MF.getSubtarget().getRegisterInfo(); + MRI = &MF.getRegInfo(); + + bool Changed = false; + for (MachineBasicBlock &MBB : MF) { + bool LocalChange = false; + for (MachineBasicBlock::iterator I = MBB.getFirstTerminator(), + E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + switch (MI.getOpcode()) { + default: + break; + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: + case AArch64::TBZW: + case AArch64::TBZX: + case AArch64::TBNZW: + case AArch64::TBNZX: + MachineInstr *DefMI = getOperandDef(MI.getOperand(0)); + LocalChange = (DefMI && tryToTuneBranch(MI, *DefMI)); + break; + } + // If the optimization was successful, we can't optimize any other + // branches because doing so would clobber the NZCV flags. + if (LocalChange) { + Changed = true; + break; + } + } + } + return Changed; +} + +FunctionPass *llvm::createAArch64CondBrTuning() { + return new AArch64CondBrTuning(); +} diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index 544f67433fd53..ee54550c9900b 100644 --- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -13,7 +13,9 @@ #include "AArch64.h" #include "AArch64RegisterInfo.h" +#include "AArch64Subtarget.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -84,6 +86,51 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock( DEBUG(dbgs() << " Ignoring, XZR or WZR already used by the instruction\n"); continue; } + if (MF.getSubtarget<AArch64Subtarget>().hasLSE()) { + // XZ/WZ for LSE can only be used when acquire semantics are not used, + // LDOPAL WZ is an invalid opcode. 
+ switch (MI.getOpcode()) { + case AArch64::CASALb: + case AArch64::CASALh: + case AArch64::CASALs: + case AArch64::CASALd: + case AArch64::SWPALb: + case AArch64::SWPALh: + case AArch64::SWPALs: + case AArch64::SWPALd: + case AArch64::LDADDALb: + case AArch64::LDADDALh: + case AArch64::LDADDALs: + case AArch64::LDADDALd: + case AArch64::LDEORALb: + case AArch64::LDEORALh: + case AArch64::LDEORALs: + case AArch64::LDEORALd: + case AArch64::LDSETALb: + case AArch64::LDSETALh: + case AArch64::LDSETALs: + case AArch64::LDSETALd: + case AArch64::LDSMINALb: + case AArch64::LDSMINALh: + case AArch64::LDSMINALs: + case AArch64::LDSMINALd: + case AArch64::LDSMAXALb: + case AArch64::LDSMAXALh: + case AArch64::LDSMAXALs: + case AArch64::LDSMAXALd: + case AArch64::LDUMINALb: + case AArch64::LDUMINALh: + case AArch64::LDUMINALs: + case AArch64::LDUMINALd: + case AArch64::LDUMAXALb: + case AArch64::LDUMAXALh: + case AArch64::LDUMAXALs: + case AArch64::LDUMAXALd: + continue; + default: + break; + } + } const MCInstrDesc &Desc = MI.getDesc(); for (int I = 0, E = Desc.getNumDefs(); I != E; ++I) { MachineOperand &MO = MI.getOperand(I); diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 8c2c0a564c302..04687847c1a30 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -201,7 +201,7 @@ private: bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width); - void SelectCMP_SWAP(SDNode *N); + bool SelectCMP_SWAP(SDNode *N); }; } // end anonymous namespace @@ -2609,9 +2609,13 @@ bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) { } /// We've got special pseudo-instructions for these -void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { +bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { unsigned Opcode; EVT MemTy = cast<MemSDNode>(N)->getMemoryVT(); + + // Leave IR for LSE if subtarget supports it. + if (Subtarget->hasLSE()) return false; + if (MemTy == MVT::i8) Opcode = AArch64::CMP_SWAP_8; else if (MemTy == MVT::i16) @@ -2637,6 +2641,8 @@ void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0)); ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2)); CurDAG->RemoveDeadNode(N); + + return true; } void AArch64DAGToDAGISel::Select(SDNode *Node) { @@ -2660,8 +2666,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; case ISD::ATOMIC_CMP_SWAP: - SelectCMP_SWAP(Node); - return; + if (SelectCMP_SWAP(Node)) + return; + break; case ISD::READ_REGISTER: if (tryReadRegister(Node)) diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 083ca2156598f..2965106fd2708 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -10563,11 +10563,20 @@ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; + if (Size > 128) return AtomicExpansionKind::None; + // Nand not supported in LSE. + if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC; + // Currently leaving And and Sub to LLSC + if ((AI->getOperation() == AtomicRMWInst::And) || (AI->getOperation() == AtomicRMWInst::Sub)) + return AtomicExpansionKind::LLSC; + // Leave 128 bits to LLSC. 
+ return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC; } bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { + // If subtarget has LSE, leave cmpxchg intact for codegen. + if (Subtarget->hasLSE()) return false; // At -O0, fast-regalloc cannot cope with the live vregs necessary to // implement cmpxchg without spilling. If the address being exchanged is also // on the stack and close enough to the spill slot, this can lead to a diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td index 71826bec6b11f..de283b70210fe 100644 --- a/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -405,3 +405,49 @@ def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$scratch), (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi, GPR64:$newLo, GPR64:$newHi), []>, Sched<[WriteAtomic]>; + +// v8.1 Atomic instructions: +def : Pat<(atomic_load_add_8 GPR64:$Rn, GPR32:$Rs), (LDADDALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_add_16 GPR64:$Rn, GPR32:$Rs), (LDADDALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_add_32 GPR64:$Rn, GPR32:$Rs), (LDADDALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_add_64 GPR64:$Rn, GPR64:$Rs), (LDADDALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_or_8 GPR64:$Rn, GPR32:$Rs), (LDSETALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_or_16 GPR64:$Rn, GPR32:$Rs), (LDSETALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_or_32 GPR64:$Rn, GPR32:$Rs), (LDSETALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_or_64 GPR64:$Rn, GPR64:$Rs), (LDSETALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_xor_8 GPR64:$Rn, GPR32:$Rs), (LDEORALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_xor_16 GPR64:$Rn, GPR32:$Rs), (LDEORALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_xor_32 GPR64:$Rn, GPR32:$Rs), (LDEORALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_xor_64 GPR64:$Rn, GPR64:$Rs), (LDEORALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_max_8 GPR64:$Rn, GPR32:$Rs), (LDSMAXALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_max_16 GPR64:$Rn, GPR32:$Rs), (LDSMAXALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_max_32 GPR64:$Rn, GPR32:$Rs), (LDSMAXALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_max_64 GPR64:$Rn, GPR64:$Rs), (LDSMAXALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_umax_8 GPR64:$Rn, GPR32:$Rs), (LDUMAXALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umax_16 GPR64:$Rn, GPR32:$Rs), (LDUMAXALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umax_32 GPR64:$Rn, GPR32:$Rs), (LDUMAXALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umax_64 GPR64:$Rn, GPR64:$Rs), (LDUMAXALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_min_8 GPR64:$Rn, GPR32:$Rs), (LDSMINALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_min_16 GPR64:$Rn, GPR32:$Rs), (LDSMINALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_min_32 GPR64:$Rn, GPR32:$Rs), (LDSMINALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_min_64 GPR64:$Rn, GPR64:$Rs), (LDSMINALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_umin_8 GPR64:$Rn, GPR32:$Rs), (LDUMINALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umin_16 GPR64:$Rn, GPR32:$Rs), (LDUMINALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umin_32 GPR64:$Rn, GPR32:$Rs), (LDUMINALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umin_64 GPR64:$Rn, GPR64:$Rs), (LDUMINALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : 
Pat<(atomic_cmp_swap_8 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALb GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>; +def : Pat<(atomic_cmp_swap_16 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALh GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>; +def : Pat<(atomic_cmp_swap_32 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALs GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>; +def : Pat<(atomic_cmp_swap_64 GPR64:$Rn, GPR64:$Rold, GPR64:$Rnew), (CASALd GPR64:$Rold, GPR64:$Rnew, GPR64sp:$Rn)>; + +def : Pat<(atomic_swap_8 GPR64:$Rn, GPR32:$Rs), (SWPALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_swap_16 GPR64:$Rn, GPR32:$Rs), (SWPALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_swap_32 GPR64:$Rn, GPR32:$Rs), (SWPALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_swap_64 GPR64:$Rn, GPR64:$Rs), (SWPALd GPR64:$Rs, GPR64sp:$Rn)>; diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index eea012382150c..314e89bbca863 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1036,7 +1036,7 @@ static bool UpdateOperandRegClass(MachineInstr &Instr) { /// \brief Return the opcode that does not set flags when possible - otherwise /// return the original opcode. The caller is responsible to do the actual /// substitution and legality checking. -static unsigned convertFlagSettingOpcode(const MachineInstr &MI) { +static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) { // Don't convert all compare instructions, because for some the zero register // encoding becomes the sp register. bool MIDefinesZeroReg = false; @@ -1145,7 +1145,7 @@ bool AArch64InstrInfo::optimizeCompareInstr( return true; } unsigned Opc = CmpInstr.getOpcode(); - unsigned NewOpc = convertFlagSettingOpcode(CmpInstr); + unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr); if (NewOpc == Opc) return false; const MCInstrDesc &MCID = get(NewOpc); @@ -3318,7 +3318,7 @@ static bool getMaddPatterns(MachineInstr &Root, // When NZCV is live bail out. if (Cmp_NZCV == -1) return false; - unsigned NewOpc = convertFlagSettingOpcode(Root); + unsigned NewOpc = convertToNonFlagSettingOpc(Root); // When opcode can't change bail out. // CHECKME: do we miss any cases for opcode conversion? if (NewOpc == Opc) diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 59f3405fe439a..58e9ce583d44c 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -119,6 +119,44 @@ public: } } + /// \brief Return the opcode that set flags when possible. The caller is + /// responsible for ensuring the opc has a flag setting equivalent. 
+ static unsigned convertToFlagSettingOpc(unsigned Opc, bool &Is64Bit) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no flag setting equivalent!"); + // 32-bit cases: + case AArch64::ADDWri: Is64Bit = false; return AArch64::ADDSWri; + case AArch64::ADDWrr: Is64Bit = false; return AArch64::ADDSWrr; + case AArch64::ADDWrs: Is64Bit = false; return AArch64::ADDSWrs; + case AArch64::ADDWrx: Is64Bit = false; return AArch64::ADDSWrx; + case AArch64::ANDWri: Is64Bit = false; return AArch64::ANDSWri; + case AArch64::ANDWrr: Is64Bit = false; return AArch64::ANDSWrr; + case AArch64::ANDWrs: Is64Bit = false; return AArch64::ANDSWrs; + case AArch64::BICWrr: Is64Bit = false; return AArch64::BICSWrr; + case AArch64::BICWrs: Is64Bit = false; return AArch64::BICSWrs; + case AArch64::SUBWri: Is64Bit = false; return AArch64::SUBSWri; + case AArch64::SUBWrr: Is64Bit = false; return AArch64::SUBSWrr; + case AArch64::SUBWrs: Is64Bit = false; return AArch64::SUBSWrs; + case AArch64::SUBWrx: Is64Bit = false; return AArch64::SUBSWrx; + // 64-bit cases: + case AArch64::ADDXri: Is64Bit = true; return AArch64::ADDSXri; + case AArch64::ADDXrr: Is64Bit = true; return AArch64::ADDSXrr; + case AArch64::ADDXrs: Is64Bit = true; return AArch64::ADDSXrs; + case AArch64::ADDXrx: Is64Bit = true; return AArch64::ADDSXrx; + case AArch64::ANDXri: Is64Bit = true; return AArch64::ANDSXri; + case AArch64::ANDXrr: Is64Bit = true; return AArch64::ANDSXrr; + case AArch64::ANDXrs: Is64Bit = true; return AArch64::ANDSXrs; + case AArch64::BICXrr: Is64Bit = true; return AArch64::BICSXrr; + case AArch64::BICXrs: Is64Bit = true; return AArch64::BICSXrs; + case AArch64::SUBXri: Is64Bit = true; return AArch64::SUBSXri; + case AArch64::SUBXrr: Is64Bit = true; return AArch64::SUBSXrr; + case AArch64::SUBXrs: Is64Bit = true; return AArch64::SUBSXrs; + case AArch64::SUBXrx: Is64Bit = true; return AArch64::SUBSXrx; + } + } + + /// Return true if this is a load/store that can be potentially paired/merged. bool isCandidateToMergeOrPair(MachineInstr &MI) const; diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 9243eb91cc1ac..005f2d51e4036 100644 --- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -795,6 +795,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, int LoadSize = getMemScale(*LoadI); int StoreSize = getMemScale(*StoreI); unsigned LdRt = getLdStRegOp(*LoadI).getReg(); + const MachineOperand &StMO = getLdStRegOp(*StoreI); unsigned StRt = getLdStRegOp(*StoreI).getReg(); bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); @@ -807,7 +808,13 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, // Remove the load, if the destination register of the loads is the same // register for stored value. if (StRt == LdRt && LoadSize == 8) { - StoreI->clearRegisterKills(StRt, TRI); + for (MachineInstr &MI : make_range(StoreI->getIterator(), + LoadI->getIterator())) { + if (MI.killsRegister(StRt, TRI)) { + MI.clearRegisterKills(StRt, TRI); + break; + } + } DEBUG(dbgs() << "Remove load instruction:\n "); DEBUG(LoadI->print(dbgs())); DEBUG(dbgs() << "\n"); @@ -819,7 +826,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt) .addReg(IsStoreXReg ? 
AArch64::XZR : AArch64::WZR) - .addReg(StRt) + .add(StMO) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); } else { // FIXME: Currently we disable this transformation in big-endian targets as @@ -860,14 +867,14 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri), DestReg) - .addReg(StRt) + .add(StMO) .addImm(AndMaskEncoded); } else { BitExtMI = BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri), DestReg) - .addReg(StRt) + .add(StMO) .addImm(Immr) .addImm(Imms); } @@ -876,7 +883,10 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, // Clear kill flags between store and load. for (MachineInstr &MI : make_range(StoreI->getIterator(), BitExtMI->getIterator())) - MI.clearRegisterKills(StRt, TRI); + if (MI.killsRegister(StRt, TRI)) { + MI.clearRegisterKills(StRt, TRI); + break; + } DEBUG(dbgs() << "Promoting load by replacing :\n "); DEBUG(StoreI->print(dbgs())); diff --git a/lib/Target/AArch64/AArch64MacroFusion.cpp b/lib/Target/AArch64/AArch64MacroFusion.cpp index 3b71d529db59b..ccc9d2ad1b482 100644 --- a/lib/Target/AArch64/AArch64MacroFusion.cpp +++ b/lib/Target/AArch64/AArch64MacroFusion.cpp @@ -7,37 +7,27 @@ // //===----------------------------------------------------------------------===// // -// \file This file contains the AArch64 implementation of the DAG scheduling mutation -// to pair instructions back to back. +/// \file This file contains the AArch64 implementation of the DAG scheduling +/// mutation to pair instructions back to back. // //===----------------------------------------------------------------------===// #include "AArch64MacroFusion.h" #include "AArch64Subtarget.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Support/CommandLine.h" +#include "llvm/CodeGen/MacroFusion.h" #include "llvm/Target/TargetInstrInfo.h" -#define DEBUG_TYPE "misched" - -STATISTIC(NumFused, "Number of instr pairs fused"); - using namespace llvm; -static cl::opt<bool> EnableMacroFusion("aarch64-misched-fusion", cl::Hidden, - cl::desc("Enable scheduling for macro fusion."), cl::init(true)); - namespace { -/// \brief Verify that the instr pair, FirstMI and SecondMI, should be fused -/// together. Given an anchor instr, when the other instr is unspecified, then -/// check if the anchor instr may be part of a fused pair at all. +/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused +/// together. Given SecondMI, when FirstMI is unspecified, then check if +/// SecondMI may be part of a fused pair at all. static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, const TargetSubtargetInfo &TSI, const MachineInstr *FirstMI, - const MachineInstr *SecondMI) { - assert((FirstMI || SecondMI) && "At least one instr must be specified"); - + const MachineInstr &SecondMI) { const AArch64InstrInfo &II = static_cast<const AArch64InstrInfo&>(TII); const AArch64Subtarget &ST = static_cast<const AArch64Subtarget&>(TSI); @@ -45,9 +35,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, unsigned FirstOpcode = FirstMI ? FirstMI->getOpcode() : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END); - unsigned SecondOpcode = - SecondMI ? SecondMI->getOpcode() - : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END); + unsigned SecondOpcode = SecondMI.getOpcode(); if (ST.hasArithmeticBccFusion()) // Fuse CMN, CMP, TST followed by Bcc. 
@@ -128,158 +116,49 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, if (ST.hasFuseAES()) // Fuse AES crypto operations. - switch(FirstOpcode) { + switch(SecondOpcode) { // AES encode. - case AArch64::AESErr: - return SecondOpcode == AArch64::AESMCrr || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; + case AArch64::AESMCrr : + return FirstOpcode == AArch64::AESErr || + FirstOpcode == AArch64::INSTRUCTION_LIST_END; // AES decode. - case AArch64::AESDrr: - return SecondOpcode == AArch64::AESIMCrr || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; + case AArch64::AESIMCrr: + return FirstOpcode == AArch64::AESDrr || + FirstOpcode == AArch64::INSTRUCTION_LIST_END; } if (ST.hasFuseLiterals()) // Fuse literal generation operations. - switch (FirstOpcode) { + switch (SecondOpcode) { // PC relative address. - case AArch64::ADRP: - return SecondOpcode == AArch64::ADDXri || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; + case AArch64::ADDXri: + return FirstOpcode == AArch64::ADRP || + FirstOpcode == AArch64::INSTRUCTION_LIST_END; // 32 bit immediate. - case AArch64::MOVZWi: - return (SecondOpcode == AArch64::MOVKWi && - SecondMI->getOperand(3).getImm() == 16) || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; - // Lower half of 64 bit immediate. - case AArch64::MOVZXi: - return (SecondOpcode == AArch64::MOVKXi && - SecondMI->getOperand(3).getImm() == 16) || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; - // Upper half of 64 bit immediate. + case AArch64::MOVKWi: + return (FirstOpcode == AArch64::MOVZWi && + SecondMI.getOperand(3).getImm() == 16) || + FirstOpcode == AArch64::INSTRUCTION_LIST_END; + // Lower and upper half of 64 bit immediate. case AArch64::MOVKXi: - return FirstMI->getOperand(3).getImm() == 32 && - ((SecondOpcode == AArch64::MOVKXi && - SecondMI->getOperand(3).getImm() == 48) || - SecondOpcode == AArch64::INSTRUCTION_LIST_END); + return FirstOpcode == AArch64::INSTRUCTION_LIST_END || + (FirstOpcode == AArch64::MOVZXi && + SecondMI.getOperand(3).getImm() == 16) || + (FirstOpcode == AArch64::MOVKXi && + FirstMI->getOperand(3).getImm() == 32 && + SecondMI.getOperand(3).getImm() == 48); } return false; } -/// \brief Implement the fusion of instr pairs in the scheduling DAG, -/// anchored at the instr in AnchorSU.. -static bool scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit &AnchorSU) { - const MachineInstr *AnchorMI = AnchorSU.getInstr(); - if (!AnchorMI || AnchorMI->isPseudo() || AnchorMI->isTransient()) - return false; - - // If the anchor instr is the ExitSU, then consider its predecessors; - // otherwise, its successors. - bool Preds = (&AnchorSU == &DAG->ExitSU); - SmallVectorImpl<SDep> &AnchorDeps = Preds ? AnchorSU.Preds : AnchorSU.Succs; - - const MachineInstr *FirstMI = Preds ? nullptr : AnchorMI; - const MachineInstr *SecondMI = Preds ? AnchorMI : nullptr; - - // Check if the anchor instr may be fused. - if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(), - FirstMI, SecondMI)) - return false; - - // Explorer for fusion candidates among the dependencies of the anchor instr. - for (SDep &Dep : AnchorDeps) { - // Ignore dependencies that don't enforce ordering. - if (Dep.isWeak()) - continue; - - SUnit &DepSU = *Dep.getSUnit(); - // Ignore the ExitSU if the dependents are successors. - if (!Preds && &DepSU == &DAG->ExitSU) - continue; - - const MachineInstr *DepMI = DepSU.getInstr(); - if (!DepMI || DepMI->isPseudo() || DepMI->isTransient()) - continue; - - FirstMI = Preds ? DepMI : AnchorMI; - SecondMI = Preds ? 
AnchorMI : DepMI; - if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(), - FirstMI, SecondMI)) - continue; - - // Create a single weak edge between the adjacent instrs. The only effect is - // to cause bottom-up scheduling to heavily prioritize the clustered instrs. - SUnit &FirstSU = Preds ? DepSU : AnchorSU; - SUnit &SecondSU = Preds ? AnchorSU : DepSU; - DAG->addEdge(&SecondSU, SDep(&FirstSU, SDep::Cluster)); - - // Adjust the latency between the anchor instr and its - // predecessors/successors. - for (SDep &IDep : AnchorDeps) - if (IDep.getSUnit() == &DepSU) - IDep.setLatency(0); - - // Adjust the latency between the dependent instr and its - // successors/predecessors. - for (SDep &IDep : Preds ? DepSU.Succs : DepSU.Preds) - if (IDep.getSUnit() == &AnchorSU) - IDep.setLatency(0); - - DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse "; - FirstSU.print(dbgs(), DAG); dbgs() << " - "; - SecondSU.print(dbgs(), DAG); dbgs() << " / "; - dbgs() << DAG->TII->getName(FirstMI->getOpcode()) << " - " << - DAG->TII->getName(SecondMI->getOpcode()) << '\n'; ); - - if (&SecondSU != &DAG->ExitSU) - // Make instructions dependent on FirstSU also dependent on SecondSU to - // prevent them from being scheduled between FirstSU and and SecondSU. - for (SUnit::const_succ_iterator - SI = FirstSU.Succs.begin(), SE = FirstSU.Succs.end(); - SI != SE; ++SI) { - if (!SI->getSUnit() || SI->getSUnit() == &SecondSU) - continue; - DEBUG(dbgs() << " Copy Succ "; - SI->getSUnit()->print(dbgs(), DAG); dbgs() << '\n';); - DAG->addEdge(SI->getSUnit(), SDep(&SecondSU, SDep::Artificial)); - } - - ++NumFused; - return true; - } - - return false; -} - -/// \brief Post-process the DAG to create cluster edges between instrs that may -/// be fused by the processor into a single operation. -class AArch64MacroFusion : public ScheduleDAGMutation { -public: - AArch64MacroFusion() {} - - void apply(ScheduleDAGInstrs *DAGInstrs) override; -}; - -void AArch64MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { - ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); - - // For each of the SUnits in the scheduling block, try to fuse the instr in it - // with one in its successors. - for (SUnit &ISU : DAG->SUnits) - scheduleAdjacentImpl(DAG, ISU); - - // Try to fuse the instr in the ExitSU with one in its predecessors. - scheduleAdjacentImpl(DAG, DAG->ExitSU); -} - } // end namespace namespace llvm { std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation () { - return EnableMacroFusion ? make_unique<AArch64MacroFusion>() : nullptr; + return createMacroFusionDAGMutation(shouldScheduleAdjacent); } } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64MacroFusion.h b/lib/Target/AArch64/AArch64MacroFusion.h index e5efedd9fbfd9..32d90d4c40d6f 100644 --- a/lib/Target/AArch64/AArch64MacroFusion.h +++ b/lib/Target/AArch64/AArch64MacroFusion.h @@ -2,23 +2,18 @@ // // The LLVM Compiler Infrastructure // -// \fileThis file is distributed under the University of Illinois Open Source +// This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // -// This file contains the AArch64 definition of the DAG scheduling mutation -// to pair instructions back to back. +/// \file This file contains the AArch64 definition of the DAG scheduling +/// mutation to pair instructions back to back. 
// //===----------------------------------------------------------------------===// -#include "AArch64InstrInfo.h" #include "llvm/CodeGen/MachineScheduler.h" -//===----------------------------------------------------------------------===// -// AArch64MacroFusion - DAG post-processing to encourage fusion of macro ops. -//===----------------------------------------------------------------------===// - namespace llvm { /// Note that you have to add: diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index 9b3899e0681cf..69124dbd0f838 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -469,10 +469,6 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { getCopyMapping(DstRB.getID(), SrcRB.getID(), Size), /*NumOperands*/ 2); } - case TargetOpcode::G_SEQUENCE: - // FIXME: support this, but the generic code is really not going to do - // anything sane. - return getInvalidInstructionMapping(); default: break; } diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td index 303398ea0b7f3..5d1608ef04afa 100644 --- a/lib/Target/AArch64/AArch64SchedA57.td +++ b/lib/Target/AArch64/AArch64SchedA57.td @@ -13,7 +13,7 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// The Cortex-A57 is a traditional superscaler microprocessor with a +// The Cortex-A57 is a traditional superscalar microprocessor with a // conservative 3-wide in-order stage for decode and dispatch. Combined with the // much wider out-of-order issue stage, this produced a need to carefully // schedule micro-ops so that all three decoded each cycle are successfully diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td index 3d737402022d8..0aeb1f3e30584 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -32,8 +32,8 @@ //===----------------------------------------------------------------------===// // Define 0 micro-op types -def FalkorWr_none_1cyc : SchedWriteRes<[]> { - let Latency = 1; +def FalkorWr_LdStInc_none_3cyc : SchedWriteRes<[]> { + let Latency = 3; let NumMicroOps = 0; } def FalkorWr_none_3cyc : SchedWriteRes<[]> { @@ -505,7 +505,8 @@ def FalkorWr_4VXVY_4ST_4VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST, let NumMicroOps = 12; } -// Forwarding logic is modeled for multiply add/accumulate. +// Forwarding logic is modeled for multiply add/accumulate and +// load/store base register increment. 
// ----------------------------------------------------------------------------- def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>; def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>; @@ -513,9 +514,13 @@ def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>; def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>; +def FalkorReadIncLd : SchedReadAdvance<2, [FalkorWr_LdStInc_none_3cyc]>; +def FalkorReadIncSt : SchedReadAdvance<1, [FalkorWr_LdStInc_none_3cyc]>; + // SchedPredicates and WriteVariants for Immediate Zero and LSLFast/ASRFast // ----------------------------------------------------------------------------- -def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>; +def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).isImm() && + MI->getOperand(1).getImm() == 0}]>; def FalkorOp1ZrReg : SchedPredicate<[{MI->getOperand(1).getReg() == AArch64::WZR || MI->getOperand(1).getReg() == AArch64::XZR}]>; @@ -770,84 +775,113 @@ def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], // SIMD Load Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instrs LD2i64)>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instrs LD2i64_POST)>; - -def : InstRW<[FalkorWr_1LD_1VXVY_4cyc], (instregex "^LD1i(8|16|32)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1VXVY_4cyc], (instregex "^LD1i(8|16|32)_POST$")>; - -def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD3i64)>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instrs LD3i64_POST)>; -def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD4i64)>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instrs LD4i64_POST)>; - -def : InstRW<[FalkorWr_1LD_2VXVY_4cyc], (instregex "^LD2i(8|16|32)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_2VXVY_4cyc], (instregex 
"^LD2i(8|16|32)_POST$")>; - -def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_3LD_3cyc], (instrs LD3Threev2d)>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instrs LD3Threev2d_POST)>; -def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>; - -def : InstRW<[FalkorWr_1LD_3VXVY_4cyc], (instregex "^LD3i(8|16|32)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3VXVY_4cyc], (instregex "^LD3i(8|16|32)_POST$")>; - -def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_4LD_3cyc], (instrs LD4Fourv2d)>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instrs LD4Fourv2d_POST)>; -def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>; - -def : InstRW<[FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)_POST$")>; - -def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc], (instregex "^LD3Threev(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_1none_4cyc], - (instregex "^LD3Threev(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc], (instregex "^LD4Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_2none_4cyc], - (instregex "^LD4Fourv(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD3Threev(16b|8h|4s)$")>; - -def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD4Fourv(16b|8h|4s)$")>; - -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc], - (instregex "^LD3Threev(16b|8h|4s)_POST$")>; - -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc], - (instregex "^LD4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instrs LD2i64)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, 
FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instrs LD2i64_POST)>; + +def : InstRW<[FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd], (instregex "^LD1i(8|16|32)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD1i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD2Twov(8b|4h|2s)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD2Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD2Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instrs LD3i64)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instrs LD3i64_POST)>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instrs LD4i64)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instrs LD4i64_POST)>; + +def : InstRW<[FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd], (instregex "^LD2i(8|16|32)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD2i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD3Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instregex "^LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd], + (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instrs LD3Threev2d)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd], + (instrs LD3Threev2d_POST)>; +def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instregex "^LD3Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd], + (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd], (instregex "^LD3i(8|16|32)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD3i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], (instregex 
"^LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], + (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], (instregex "^LD4Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], + (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd], + (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instrs LD4Fourv2d)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd], + (instrs LD4Fourv2d_POST)>; +def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instregex "^LD4Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd], + (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd], (instregex "^LD4i(8|16|32)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD4i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd], + (instregex "^LD3Threev(8b|4h|2s)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd], + (instregex "^LD3Threev(8b|4h|2s)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd], + (instregex "^LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd], + (instregex "^LD4Fourv(8b|4h|2s)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD3Threev(16b|8h|4s)$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD4Fourv(16b|8h|4s)$")>; + +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD3Threev(16b|8h|4s)_POST$")>; + +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD4Fourv(16b|8h|4s)_POST$")>; // Arithmetic and Logical Instructions // ----------------------------------------------------------------------------- @@ -929,87 +963,105 @@ def : InstRW<[FalkorWr_5VXVY_7cyc], (instregex "^TBX(v8i8Four|v16i8Four)$")>; // SIMD Store Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STR(Q|D|S|H|B)ui$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc], +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STR(Q|D|S|H|B)ui$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^STR(Q|D|S|H|B)(post|pre)$")>; -def : InstRW<[FalkorWr_STRVro], (instregex "^STR(D|S|H|B)ro(W|X)$")>; -def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^STPQi$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2VSD_2ST_0cyc], +def : InstRW<[FalkorWr_STRVro, ReadDefault, FalkorReadIncSt], + (instregex "^STR(D|S|H|B)ro(W|X)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STPQi$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], (instregex "^STPQ(post|pre)$")>; -def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex 
"^STP(D|S)(i)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc], +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STP(D|S)(i)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], (instregex "^STP(D|S)(post|pre)$")>; -def : InstRW<[FalkorWr_STRQro], (instregex "^STRQro(W|X)$")>; -def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STUR(Q|D|S|B|H)i$")>; -def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instrs STNPDi, STNPSi)>; -def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instrs STNPQi)>; - -def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64)|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc], +def : InstRW<[FalkorWr_STRQro, ReadDefault, FalkorReadIncSt], + (instregex "^STRQro(W|X)$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STUR(Q|D|S|B|H)i$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instrs STNPDi, STNPSi)>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instrs STNPQi)>; + +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64)|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST1(One(v8b|v4h|v2s|v1d)_POST|(i8|i16|i32|i64)_POST)$")>; -def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))$")>; -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc], +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST2(Two(v8b|v4h|v2s)|(i8|i16|i32|i64))$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>; -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc], - (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>; - -def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>; -def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>; -def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST3(i8|i16|i32|i64)$")>; -def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST4(i8|i16|i32|i64)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST2(Two(v8b|v4h|v2s)|(i8|i16|i32|i64))_POST$")>; + +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST3(i8|i16|i32|i64)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST4(i8|i16|i32|i64)$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). 
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST2Two(v16b|v8h|v4s|v2d)_POST$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST3(i8|i16|i32|i64)_POST$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST4(i8|i16|i32|i64)_POST$")>; -def : InstRW<[FalkorWr_1VXVY_2ST_2VSD_0cyc], - (instregex "^ST3Three(v8b|v4h|v2s|v1d)$")>; +def : InstRW<[FalkorWr_1VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST3Three(v8b|v4h|v2s)$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VXVY_2ST_2VSD_0cyc], - (instregex "^ST3Three(v8b|v4h|v2s|v1d)_POST$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST3Three(v8b|v4h|v2s)_POST$")>; -def : InstRW<[FalkorWr_3VSD_3ST_0cyc], (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>; -def : InstRW<[FalkorWr_3VSD_3ST_0cyc], (instrs ST3Threev2d)>; +def : InstRW<[FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt], + (instrs ST3Threev2d)>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST1Three(v16b|v8h|v4s|v2d)_POST$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt], (instrs ST3Threev2d_POST)>; -def : InstRW<[FalkorWr_2VXVY_2ST_2VSD_0cyc], - (instregex "^ST4Four(v8b|v4h|v2s|v1d)$")>; +def : InstRW<[FalkorWr_2VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST4Four(v8b|v4h|v2s)$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_2ST_2VSD_0cyc], - (instregex "^ST4Four(v8b|v4h|v2s|v1d)_POST$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST4Four(v8b|v4h|v2s)_POST$")>; -def : InstRW<[FalkorWr_4VSD_4ST_0cyc], (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>; -def : InstRW<[FalkorWr_4VSD_4ST_0cyc], (instrs ST4Fourv2d)>; +def : InstRW<[FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt], + (instrs ST4Fourv2d)>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST1Four(v16b|v8h|v4s|v2d)_POST$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). 
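// Aside on the operand-list shape used throughout this section (illustrative
// only, not part of the patch): in an InstRW list the SchedWrite entries come
// first, one per def, and the SchedRead entries follow, one per register
// source. ReadDefault holds a source slot that needs no special treatment so
// that FalkorReadIncSt lines up with the base-address operand it models, e.g.
//   [FalkorWr_1VSD_1ST_0cyc, ReadDefault /*data*/, FalkorReadIncSt /*base*/]
// and a pair store simply gains a second ReadDefault for its second data
// operand ahead of the FalkorReadIncSt entry.
// (The FIXME above applies to the _POST mapping that follows.)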
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt], (instrs ST4Fourv2d_POST)>; -def : InstRW<[FalkorWr_2VXVY_4ST_4VSD_0cyc], +def : InstRW<[FalkorWr_2VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST3Three(v16b|v8h|v4s)$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_4ST_4VSD_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST3Three(v16b|v8h|v4s)_POST$")>; -def : InstRW<[FalkorWr_4VXVY_4ST_4VSD_0cyc], +def : InstRW<[FalkorWr_4VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST4Four(v16b|v8h|v4s)$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VXVY_4ST_4VSD_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST4Four(v16b|v8h|v4s)_POST$")>; // Branch Instructions @@ -1033,22 +1085,25 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instrs SHA256SU1rrr)>; // FP Load Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LDR(Q|D|S|H|B)(post|pre)$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(Q|D|S|H|B)i$")>; -def : InstRW<[FalkorWr_LDRro], (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>; -def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDUR(Q|D|S|H|B)i$")>; +def : InstRW<[FalkorWr_LDRro, FalkorReadIncLd], + (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instrs LDNPQi)>; -def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instrs LDPQi)>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "LDNP(D|S)i$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "LDP(D|S)i$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "LDP(D|S)(pre|post)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "^LDPQ(pre|post)$")>; // FP Data Processing Instructions @@ -1106,31 +1161,41 @@ def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v2f64|v // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFMui, PRFMl)>; def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFUMi)>; -def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "^LDNP(W|X)i$")>; -def : 
InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "^LDP(W|X)i$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "^LDP(W|X)(post|pre)$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(BB|HH|W|X)ui$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDR(BB|HH|W|X)ui$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LDR(BB|HH|W|X)(post|pre)$")>; -def : InstRW<[FalkorWr_LDRro], (instregex "^LDR(BB|HH|W|X)ro(W|X)$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(W|X)l$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDTR(B|H|W|X)i$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(BB|HH|W|X)i$")>; +def : InstRW<[FalkorWr_LDRro, FalkorReadIncLd], + (instregex "^LDR(BB|HH|W|X)ro(W|X)$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDR(W|X)l$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDTR(B|H|W|X)i$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDUR(BB|HH|W|X)i$")>; def : InstRW<[FalkorWr_PRFMro], (instregex "^PRFMro(W|X)$")>; -def : InstRW<[FalkorWr_1LD_4cyc, FalkorWr_none_4cyc], +def : InstRW<[FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd], (instrs LDPSWi)>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4cyc, FalkorWr_none_4cyc], +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd], (instregex "^LDPSW(post|pre)$")>; -def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4cyc], +def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd], + (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4cyc, FalkorReadIncLd], (instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>; -def : InstRW<[FalkorWr_LDRSro], (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>; -def : InstRW<[FalkorWr_1LD_4cyc], (instrs LDRSWl)>; -def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>; -def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDURS(BW|BX|HW|HX|W)i$")>; +def : InstRW<[FalkorWr_LDRSro, FalkorReadIncLd], + (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>; +def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd], + (instrs LDRSWl)>; +def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd], + (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>; +def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd], + (instregex "^LDURS(BW|BX|HW|HX|W)i$")>; // Miscellaneous Data-Processing Instructions // ----------------------------------------------------------------------------- @@ -1178,32 +1243,46 @@ def : InstRW<[FalkorWr_1none_0cyc], (instrs BRK, DCPS1, DCPS2, DCPS3, HINT, HL def : InstRW<[FalkorWr_1ST_0cyc], (instrs SYSxt, SYSLxt)>; def : InstRW<[FalkorWr_1Z_0cyc], (instrs MSRpstateImm1, MSRpstateImm4)>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^(LDAR(B|H|W|X)|LDAXP(W|X)|LDAXR(B|H|W|X)|LDXP(W|X)|LDXR(B|H|W|X))$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^(LDAR(B|H|W|X)|LDAXR(B|H|W|X)|LDXR(B|H|W|X))$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], + (instregex "^(LDAXP(W|X)|LDXP(W|X))$")>; def : InstRW<[FalkorWr_1LD_3cyc], (instrs MRS, MOVbaseTLS)>; def : InstRW<[FalkorWr_1LD_1Z_3cyc], (instrs DRPS)>; 
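// The LDAXP/LDXP split above follows the general rule that a two-result load
// lists one SchedWrite per def, so each destination gets its own latency;
// a sketch of the shape (hypothetical names, illustrative only):
//   def : InstRW<[WriteForDst0, WriteForDst1, ReadForBase], (instrs ...)>;
// Here FalkorWr_none_3cyc supplies the second write, covering the high
// register of the pair.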
def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs MSR)>; -def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs STNPWi, STNPXi)>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instrs STNPWi, STNPXi)>; def : InstRW<[FalkorWr_2LD_1Z_3cyc], (instrs ERET)>; def : InstRW<[FalkorWr_1ST_1SD_1LD_3cyc], (instregex "^LDC.*$")>; -def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STLR(B|H|W|X)$")>; -def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STXP(W|X)$")>; -def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STXR(B|H|W|X)$")>; - -def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXP(W|X)$")>; -def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXR(B|H|W|X)$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STLR(B|H|W|X)$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STXP(W|X)$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STXR(B|H|W|X)$")>; + +def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc, ReadDefault, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STLXP(W|X)$")>; +def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STLXR(B|H|W|X)$")>; // Store Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STP(W|X)i$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1SD_1ST_0cyc], +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STP(W|X)i$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], (instregex "^STP(W|X)(post|pre)$")>; -def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STR(BB|HH|W|X)ui$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1SD_1ST_0cyc], +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STR(BB|HH|W|X)ui$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^STR(BB|HH|W|X)(post|pre)$")>; -def : InstRW<[FalkorWr_STRro], (instregex "^STR(BB|HH|W|X)ro(W|X)$")>; -def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STTR(B|H|W|X)i$")>; -def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STUR(BB|HH|W|X)i$")>; +def : InstRW<[FalkorWr_STRro, ReadDefault, FalkorReadIncSt], + (instregex "^STR(BB|HH|W|X)ro(W|X)$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STTR(B|H|W|X)i$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STUR(BB|HH|W|X)i$")>; diff --git a/lib/Target/AArch64/AArch64SchedKryoDetails.td b/lib/Target/AArch64/AArch64SchedKryoDetails.td index 02cccccd3078c..cf4cdabb8cbfc 100644 --- a/lib/Target/AArch64/AArch64SchedKryoDetails.td +++ b/lib/Target/AArch64/AArch64SchedKryoDetails.td @@ -1374,7 +1374,9 @@ def KryoWrite_3cyc_LS_LS_400ln : let Latency = 3; let NumMicroOps = 2; } def : InstRW<[KryoWrite_3cyc_LS_LS_400ln], - (instregex "(LDAX?R(B|H|W|X)|LDAXP(W|X))")>; + (instregex "LDAX?R(B|H|W|X)")>; +def : InstRW<[KryoWrite_3cyc_LS_LS_400ln, WriteLDHi], + (instregex "LDAXP(W|X)")>; def KryoWrite_3cyc_LS_LS_401ln : SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { let Latency = 3; let NumMicroOps = 2; @@ -1565,7 +1567,7 @@ def KryoWrite_3cyc_LS_258ln : SchedWriteRes<[KryoUnitLS]> { let Latency = 3; let NumMicroOps = 1; } -def : InstRW<[KryoWrite_3cyc_LS_258ln], +def : 
InstRW<[KryoWrite_3cyc_LS_258ln, WriteLDHi], (instregex "LDXP(W|X)")>; def KryoWrite_3cyc_LS_258_1ln : SchedWriteRes<[KryoUnitLS]> { diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index d4a8cecdb29f1..6660f0babb8a6 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -47,6 +47,11 @@ static cl::opt<bool> EnableCCMP("aarch64-enable-ccmp", cl::desc("Enable the CCMP formation pass"), cl::init(true), cl::Hidden); +static cl::opt<bool> + EnableCondBrTuning("aarch64-enable-cond-br-tune", + cl::desc("Enable the conditional branch tuning pass"), + cl::init(true), cl::Hidden); + static cl::opt<bool> EnableMCR("aarch64-enable-mcr", cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); @@ -429,6 +434,8 @@ bool AArch64PassConfig::addILPOpts() { addPass(createAArch64ConditionalCompares()); if (EnableMCR) addPass(&MachineCombinerID); + if (EnableCondBrTuning) + addPass(createAArch64CondBrTuning()); if (EnableEarlyIfConversion) addPass(&EarlyIfConverterID); if (EnableStPairSuppress) diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index f0f50f29be0f3..02b12b5e90ca2 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -43,6 +43,7 @@ add_llvm_target(AArch64CodeGen AArch64AsmPrinter.cpp AArch64CleanupLocalDynamicTLSPass.cpp AArch64CollectLOH.cpp + AArch64CondBrTuning.cpp AArch64ConditionalCompares.cpp AArch64DeadRegisterDefinitionsPass.cpp AArch64ExpandPseudoInsts.cpp diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 43a6fa9ce0896..3d075018904c0 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -43,26 +43,25 @@ public: const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = { - // This table *must* be in the order that the fixup_* kinds are defined in - // AArch64FixupKinds.h. - // - // Name Offset (bits) Size (bits) Flags - { "fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal }, - { "fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal }, - { "fixup_aarch64_add_imm12", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale1", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale2", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale4", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale8", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale16", 10, 12, 0 }, - { "fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal }, - { "fixup_aarch64_movw", 5, 16, 0 }, - { "fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal }, - { "fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal }, - { "fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal }, - { "fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal }, - { "fixup_aarch64_tlsdesc_call", 0, 0, 0 } - }; + // This table *must* be in the order that the fixup_* kinds are defined + // in AArch64FixupKinds.h. 
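// (Illustrative gloss on the columns, not part of the patch: "Offset" is the
// bit position the resolved fixup value is shifted to before being patched
// into the instruction word -- applyFixup() below does
// `Value <<= Info.TargetOffset` -- "Size" is the width of that bit-field,
// and PCRelFlagVal marks the fixup kind as PC-relative.)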
+ // + // Name Offset (bits) Size (bits) Flags + {"fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal}, + {"fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal}, + {"fixup_aarch64_add_imm12", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale1", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale2", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale4", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale8", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale16", 10, 12, 0}, + {"fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal}, + {"fixup_aarch64_movw", 5, 16, 0}, + {"fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal}, + {"fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal}, + {"fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal}, + {"fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal}, + {"fixup_aarch64_tlsdesc_call", 0, 0, 0}}; if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); @@ -72,8 +71,9 @@ public: return Infos[Kind - FirstTargetFixupKind]; } - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; bool mayNeedRelaxation(const MCInst &Inst) const override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, @@ -261,13 +261,15 @@ unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) con } } -void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); if (!Value) return; // Doesn't change encoding. MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); + MCContext &Ctx = Asm.getContext(); // Apply any target-specific value adjustments. Value = adjustFixupValue(Fixup, Value, Ctx); @@ -275,7 +277,7 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, Value <<= Info.TargetOffset; unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); // Used to point to big endian bytes. 
unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind()); @@ -289,7 +291,7 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, } } else { // Handle as big-endian - assert((Offset + FulleSizeInBytes) <= DataSize && "Invalid fixup size!"); + assert((Offset + FulleSizeInBytes) <= Data.size() && "Invalid fixup size!"); assert(NumBytes <= FulleSizeInBytes && "Invalid fixup size!"); for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = FulleSizeInBytes - 1 - i; @@ -539,16 +541,14 @@ public: return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian, IsILP32); } - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override; + void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, bool &IsResolved) override; }; -void ELFAArch64AsmBackend::processFixupValue( - const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup, - const MCFragment *DF, const MCValue &Target, uint64_t &Value, - bool &IsResolved) { +void ELFAArch64AsmBackend::processFixupValue(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target, + bool &IsResolved) { // The ADRP instruction adds some multiple of 0x1000 to the current PC & // ~0xfff. This means that the required offset to reach a symbol can vary by // up to one step depending on where the ADRP is in memory. For example: diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 6ab2b9ef04598..7494e5decd6f6 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -238,6 +238,36 @@ def FeatureSDWA : SubtargetFeature<"sdwa", "Support SDWA (Sub-DWORD Addressing) extension" >; +def FeatureSDWAOmod : SubtargetFeature<"sdwa-omod", + "HasSDWAOmod", + "true", + "Support OMod with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWAScalar : SubtargetFeature<"sdwa-scalar", + "HasSDWAScalar", + "true", + "Support scalar register with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWASdst : SubtargetFeature<"sdwa-sdst", + "HasSDWASdst", + "true", + "Support scalar dst for VOPC with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWAMac : SubtargetFeature<"sdwa-mav", + "HasSDWAMac", + "true", + "Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWAClampVOPC : SubtargetFeature<"sdwa-clamp-vopc", + "HasSDWAClampVOPC", + "true", + "Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension" +>; + def FeatureDPP : SubtargetFeature<"dpp", "HasDPP", "true", @@ -421,8 +451,8 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, - FeatureScalarStores, FeatureInv2PiInlineImm, FeatureSDWA, - FeatureDPP + FeatureScalarStores, FeatureInv2PiInlineImm, + FeatureSDWA, FeatureSDWAClampVOPC, FeatureSDWAMac, FeatureDPP ] >; @@ -432,7 +462,8 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, - FeatureFastFMAF32, FeatureSDWA, FeatureDPP, + FeatureFastFMAF32, FeatureDPP, + FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, 
FeatureFlatGlobalInsts, FeatureFlatScratchInsts ] >; @@ -449,14 +480,14 @@ class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping, def FeatureISAVersion6_0_0 : SubtargetFeatureISAVersion <6,0,0, [FeatureSouthernIslands, - FeatureFastFMAF32, + FeatureFastFMAF32, HalfRate64Ops, FeatureLDSBankCount32]>; def FeatureISAVersion6_0_1 : SubtargetFeatureISAVersion <6,0,1, [FeatureSouthernIslands, FeatureLDSBankCount32]>; - + def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0, [FeatureSeaIslands, FeatureLDSBankCount32]>; @@ -644,7 +675,11 @@ def isCIVI : Predicate < "Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, AssemblerPredicate<"FeatureCIInsts">; -def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; +def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, + AssemblerPredicate<"FeatureFlatAddressSpace">; + +def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">, + AssemblerPredicate<"FeatureFlatGlobalInsts">; def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, AssemblerPredicate<"Feature16BitInsts">; diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 5586b513b5fca..96f819fd0e684 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3527,18 +3527,25 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, //===----------------------------------------------------------------------===// SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const { + const TargetRegisterClass *RC, + unsigned Reg, EVT VT, + const SDLoc &SL, + bool RawReg) const { MachineFunction &MF = DAG.getMachineFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned VirtualRegister; + unsigned VReg; + if (!MRI.isLiveIn(Reg)) { - VirtualRegister = MRI.createVirtualRegister(RC); - MRI.addLiveIn(Reg, VirtualRegister); + VReg = MRI.createVirtualRegister(RC); + MRI.addLiveIn(Reg, VReg); } else { - VirtualRegister = MRI.getLiveInVirtReg(Reg); + VReg = MRI.getLiveInVirtReg(Reg); } - return DAG.getRegister(VirtualRegister, VT); + + if (RawReg) + return DAG.getRegister(VReg, VT); + + return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT); } uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( @@ -3657,6 +3664,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(STORE_MSKOR) NODE_NAME_CASE(LOAD_CONSTANT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) + NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3) + NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) NODE_NAME_CASE(ATOMIC_CMP_SWAP) NODE_NAME_CASE(ATOMIC_INC) NODE_NAME_CASE(ATOMIC_DEC) diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 0d066cdbdff4d..a45234e2b39f2 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -216,10 +216,25 @@ public: /// \brief Helper function that adds Reg to the LiveIn list of the DAG's /// MachineFunction. /// - /// \returns a RegisterSDNode representing Reg. - virtual SDValue CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const; + /// \returns a RegisterSDNode representing Reg if \p RawReg is true, otherwise + /// a copy from the register. 
+ SDValue CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT, + const SDLoc &SL, + bool RawReg = false) const; + SDValue CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode())); + } + + // Returns the raw live in register rather than a copy from it. + SDValue CreateLiveInRegisterRaw(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true); + } enum ImplicitParameter { FIRST_IMPLICIT, @@ -388,6 +403,8 @@ enum NodeType : unsigned { STORE_MSKOR, LOAD_CONSTANT, TBUFFER_STORE_FORMAT, + TBUFFER_STORE_FORMAT_X3, + TBUFFER_LOAD_FORMAT, ATOMIC_CMP_SWAP, ATOMIC_INC, ATOMIC_DEC, diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index a01f5d37c7c16..69dc529861729 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -66,7 +66,9 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td enum SIEncodingFamily { SI = 0, - VI = 1 + VI = 1, + SDWA = 2, + SDWA9 = 3 }; // Wrapper for Tablegen'd function. enum Subtarget is not defined in any @@ -101,7 +103,12 @@ static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) { } int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { - int MCOp = AMDGPU::getMCOpcode(Opcode, subtargetEncodingFamily(ST)); + SIEncodingFamily Gen = subtargetEncodingFamily(ST); + if (get(Opcode).TSFlags & SIInstrFlags::SDWA) + Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9 + : SIEncodingFamily::SDWA; + + int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); // -1 means that Opcode is already a native instruction. if (MCOp == -1) diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index e286558ce60d7..bcf89bb78ad66 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -70,6 +70,10 @@ def AMDGPUElseBreakOp : SDTypeProfile<1, 2, [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, i64>] >; +def AMDGPUAddeSubeOp : SDTypeProfile<2, 3, + [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>] +>; + //===----------------------------------------------------------------------===// // AMDGPU DAG Nodes // @@ -179,6 +183,12 @@ def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>; // out = (src1 > src0) ? 1 : 0 def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>; +// TODO: remove AMDGPUadde/AMDGPUsube when ADDCARRY/SUBCARRY get their own +// nodes in TargetSelectionDAG.td. 
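// ISD::ADDCARRY/SUBCARRY compute (result, carry-out) from (lhs, rhs,
// carry-in), which is what AMDGPUAddeSubeOp above encodes: two results
// (an i32 value and an i1 carry-out) and three operands (i32, i32, and an
// i1 carry-in).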
+def AMDGPUadde : SDNode<"ISD::ADDCARRY", AMDGPUAddeSubeOp, []>; + +def AMDGPUsube : SDNode<"ISD::SUBCARRY", AMDGPUAddeSubeOp, []>; + def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc SDTCisVT<0, i64>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT> ]>; diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 790a69b843979..cc56216c355bf 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -29,12 +29,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() { using namespace TargetOpcode; const LLT S1= LLT::scalar(1); + const LLT V2S16 = LLT::vector(2, 16); const LLT S32 = LLT::scalar(32); const LLT S64 = LLT::scalar(64); const LLT P1 = LLT::pointer(1, 64); const LLT P2 = LLT::pointer(2, 64); setAction({G_ADD, S32}, Legal); + setAction({G_AND, S32}, Legal); + + setAction({G_BITCAST, V2S16}, Legal); + setAction({G_BITCAST, 1, S32}, Legal); + + setAction({G_BITCAST, S32}, Legal); + setAction({G_BITCAST, 1, V2S16}, Legal); // FIXME: i1 operands to intrinsics should always be legal, but other i1 // values may not be legal. We need to figure out how to distinguish @@ -61,6 +69,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() { setAction({G_SELECT, S32}, Legal); setAction({G_SELECT, 1, S1}, Legal); + setAction({G_SHL, S32}, Legal); + setAction({G_STORE, S32}, Legal); setAction({G_STORE, 1, P1}, Legal); diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 8d157e2f98f24..ab5abf2039a5b 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -124,6 +124,11 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasScalarStores(false), HasInv2PiInlineImm(false), HasSDWA(false), + HasSDWAOmod(false), + HasSDWAScalar(false), + HasSDWASdst(false), + HasSDWAMac(false), + HasSDWAClampVOPC(false), HasDPP(false), FlatAddressSpace(false), FlatInstOffsets(false), diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 5f4f20316a6ba..2b16289c723ef 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -149,6 +149,11 @@ protected: bool HasScalarStores; bool HasInv2PiInlineImm; bool HasSDWA; + bool HasSDWAOmod; + bool HasSDWAScalar; + bool HasSDWASdst; + bool HasSDWAMac; + bool HasSDWAClampVOPC; bool HasDPP; bool FlatAddressSpace; bool FlatInstOffsets; @@ -431,6 +436,26 @@ public: return HasSDWA; } + bool hasSDWAOmod() const { + return HasSDWAOmod; + } + + bool hasSDWAScalar() const { + return HasSDWAScalar; + } + + bool hasSDWASdst() const { + return HasSDWASdst; + } + + bool hasSDWAMac() const { + return HasSDWAMac; + } + + bool hasSDWAClampVOPC() const { + return HasSDWAClampVOPC; + } + /// \brief Returns the offset in bytes from the start of the input buffer /// of the first explicit kernel argument. 
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const { diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b644eba536fa4..04fe9f689806c 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -342,6 +342,14 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { PM.add(createAMDGPUExternalAAWrapperPass()); } }); + + Builder.addExtension( + PassManagerBuilder::EP_CGSCCOptimizerLate, + [](const PassManagerBuilder &, legacy::PassManagerBase &PM) { + // Add infer address spaces pass to the opt pipeline after inlining + // but before SROA to increase SROA opportunities. + PM.add(createInferAddressSpacesPass()); + }); } //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 0d6689bd04c4e..88245b01683a5 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -184,9 +184,9 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, } } -unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { - if (Vec) - return 0; +unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const { + // The concept of vector registers doesn't really exist. Some packed vector + // operations operate on the normal 32-bit registers. // Number of VGPRs on SI. if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) @@ -195,8 +195,18 @@ unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { return 4 * 128; // XXX - 4 channels. Should these count as vector instead? } +unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const { + // This is really the number of registers to fill when vectorizing / + // interleaving loops, so we lie to avoid trying to use all registers. + return getHardwareNumberOfRegisters(Vec) >> 3; +} + unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const { - return Vector ? 0 : 32; + return 32; +} + +unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const { + return 32; } unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { @@ -247,11 +257,11 @@ bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Disable unrolling if the loop is not vectorized. + // TODO: Enable this again. if (VF == 1) return 1; - // Semi-arbitrary large amount. 
- return 64; + return 8; } int AMDGPUTTIImpl::getArithmeticInstrCost( diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index a60b1bb1b59c7..485e20411ab49 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -75,8 +75,10 @@ public: return TTI::PSK_FastHardware; } - unsigned getNumberOfRegisters(bool Vector); - unsigned getRegisterBitWidth(bool Vector) const; + unsigned getHardwareNumberOfRegisters(bool Vector) const; + unsigned getNumberOfRegisters(bool Vector) const; + unsigned getRegisterBitWidth(bool Vector) const ; + unsigned getMinVectorRegisterBitWidth() const; unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 392e9d89bd9ba..7b8756050b752 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -152,6 +152,8 @@ public: ImmTyExpTgt, ImmTyExpCompr, ImmTyExpVM, + ImmTyDFMT, + ImmTyNFMT, ImmTyHwreg, ImmTyOff, ImmTySendMsg, @@ -260,6 +262,8 @@ public: return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID); } + bool isSDWARegKind() const; + bool isImmTy(ImmTy ImmT) const { return isImm() && Imm.Type == ImmT; } @@ -292,6 +296,8 @@ public: bool isGLC() const { return isImmTy(ImmTyGLC); } bool isSLC() const { return isImmTy(ImmTySLC); } bool isTFE() const { return isImmTy(ImmTyTFE); } + bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); } + bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); } bool isBankMask() const { return isImmTy(ImmTyDppBankMask); } bool isRowMask() const { return isImmTy(ImmTyDppRowMask); } bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); } @@ -636,6 +642,8 @@ public: case ImmTyGLC: OS << "GLC"; break; case ImmTySLC: OS << "SLC"; break; case ImmTyTFE: OS << "TFE"; break; + case ImmTyDFMT: OS << "DFMT"; break; + case ImmTyNFMT: OS << "NFMT"; break; case ImmTyClampSI: OS << "ClampSI"; break; case ImmTyOModSI: OS << "OModSI"; break; case ImmTyDppCtrl: OS << "DppCtrl"; break; @@ -993,7 +1001,9 @@ private: void errorExpTgt(); OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val); - bool validateOperandLimitations(const MCInst &Inst); + bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc); + bool validateConstantBusLimitations(const MCInst &Inst); + bool validateEarlyClobberLimitations(const MCInst &Inst); bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; @@ -1029,6 +1039,8 @@ public: void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); } void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); } void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); } + void cvtMtbuf(MCInst &Inst, const OperandVector &Operands); + AMDGPUOperand::Ptr defaultGLC() const; AMDGPUOperand::Ptr defaultSLC() const; AMDGPUOperand::Ptr defaultTFE() const; @@ -1042,6 +1054,7 @@ public: AMDGPUOperand::Ptr defaultSMRDOffset20() const; AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const; AMDGPUOperand::Ptr defaultOffsetU12() const; + AMDGPUOperand::Ptr defaultOffsetS13() 
const; OperandMatchResultTy parseOModOperand(OperandVector &Operands); @@ -1243,6 +1256,15 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const { return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg()); } +bool AMDGPUOperand::isSDWARegKind() const { + if (AsmParser->isVI()) + return isVReg(); + else if (AsmParser->isGFX9()) + return isRegKind(); + else + return false; +} + uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const { assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers()); @@ -2083,7 +2105,7 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) { isSGPR(mc2PseudoReg(MO.getReg()), getContext().getRegisterInfo()); } -bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) { +bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) { const unsigned Opcode = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opcode); unsigned ConstantBusUseCount = 0; @@ -2137,6 +2159,60 @@ bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) { return ConstantBusUseCount <= 1; } +bool AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst) { + + const unsigned Opcode = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opcode); + + const int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); + if (DstIdx == -1 || + Desc.getOperandConstraint(DstIdx, MCOI::EARLY_CLOBBER) == -1) { + return true; + } + + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + + const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); + const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + + assert(DstIdx != -1); + const MCOperand &Dst = Inst.getOperand(DstIdx); + assert(Dst.isReg()); + const unsigned DstReg = mc2PseudoReg(Dst.getReg()); + + const int SrcIndices[] = { Src0Idx, Src1Idx, Src2Idx }; + + for (int SrcIdx : SrcIndices) { + if (SrcIdx == -1) break; + const MCOperand &Src = Inst.getOperand(SrcIdx); + if (Src.isReg()) { + const unsigned SrcReg = mc2PseudoReg(Src.getReg()); + if (isRegIntersect(DstReg, SrcReg, TRI)) { + return false; + } + } + } + + return true; +} + +bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, + const SMLoc &IDLoc) { + if (!validateConstantBusLimitations(Inst)) { + Error(IDLoc, + "invalid operand (violates constant bus restrictions)"); + return false; + } + if (!validateEarlyClobberLimitations(Inst)) { + Error(IDLoc, + "destination must be different than all sources"); + return false; + } + + return true; +} + bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -2169,9 +2245,8 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, switch (Result) { default: break; case Match_Success: - if (!validateOperandLimitations(Inst)) { - return Error(IDLoc, - "invalid operand (violates constant bus restrictions)"); + if (!validateInstruction(Inst, IDLoc)) { + return true; } Inst.setLoc(IDLoc); Out.EmitInstruction(Inst, getSTI()); @@ -2554,11 +2629,21 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) { return MatchOperand_ParseFail; Parser.Lex(); + + bool IsMinus = false; + if (getLexer().getKind() == AsmToken::Minus) { + Parser.Lex(); + IsMinus = true; + } + if (getLexer().isNot(AsmToken::Integer)) return MatchOperand_ParseFail; if (getParser().parseAbsoluteExpression(Int)) return 
MatchOperand_ParseFail; + + if (IsMinus) + Int = -Int; break; } } @@ -3743,6 +3828,44 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); } +void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle the case where soffset is an immediate + if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) { + Op.addImmOperands(Inst, 1); + continue; + } + + // Handle tokens like 'offen' which are sometimes hard-coded into the + // asm string. There are no MCInst operands for these. + if (Op.isToken()) { + continue; + } + assert(Op.isImm()); + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTyOffset); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDFMT); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyNFMT); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); +} + //===----------------------------------------------------------------------===// // mimg //===----------------------------------------------------------------------===// @@ -3870,6 +3993,10 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetU12() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); } +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetS13() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); +} + //===----------------------------------------------------------------------===// // vop3 //===----------------------------------------------------------------------===// @@ -3919,6 +4046,8 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr}, {"gds", AMDGPUOperand::ImmTyGDS, true, nullptr}, {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr}, + {"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr}, + {"nfmt", AMDGPUOperand::ImmTyNFMT, false, nullptr}, {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, @@ -4475,12 +4604,11 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 && Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) { - // V_NOP_sdwa_vi has no optional sdwa arguments + // v_nop_sdwa_sdwa_vi/gfx9 has no optional sdwa arguments switch (BasicInstType) { case SIInstrFlags::VOP1: addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); - if (isGFX9() && - AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0); } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); @@ -4490,8 +4618,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, case SIInstrFlags::VOP2: 
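// As in the VOP1 case above, omod is appended whenever the opcode actually
// has an omod operand (getNamedOperandIdx(...) != -1) instead of being gated
// on isGFX9(); only the GFX9 SDWA encodings define omod (see FeatureSDWAOmod
// above), so checking the operand table suffices.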
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); - if (isGFX9() && - AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0); } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); @@ -4501,9 +4628,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, break; case SIInstrFlags::VOPC: - if (isVI()) { - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); - } + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD); break; diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index 2aca65ac84303..2e96c14eaa320 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -57,6 +57,11 @@ class MUBUFAddr64Table <bit is_addr64, string suffix = ""> { string OpName = NAME # suffix; } +class MTBUFAddr64Table <bit is_addr64, string suffix = ""> { + bit IsAddr64 = is_addr64; + string OpName = NAME # suffix; +} + //===----------------------------------------------------------------------===// // MTBUF classes //===----------------------------------------------------------------------===// @@ -78,14 +83,31 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins, let EXP_CNT = 1; let MTBUF = 1; let Uses = [EXEC]; - let hasSideEffects = 0; let SchedRW = [WriteVMEM]; + + let AsmMatchConverter = "cvtMtbuf"; + + bits<1> offen = 0; + bits<1> idxen = 0; + bits<1> addr64 = 0; + bits<1> has_vdata = 1; + bits<1> has_vaddr = 1; + bits<1> has_glc = 1; + bits<1> glc_value = 0; // the value for glc if no such operand + bits<4> dfmt_value = 1; // the value for dfmt if no such operand + bits<3> nfmt_value = 0; // the value for nfmt if no such operand + bits<1> has_srsrc = 1; + bits<1> has_soffset = 1; + bits<1> has_offset = 1; + bits<1> has_slc = 1; + bits<1> has_tfe = 1; + bits<1> has_dfmt = 1; + bits<1> has_nfmt = 1; } class MTBUF_Real <MTBUF_Pseudo ps> : - InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, - Enc64 { + InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> { let isPseudo = 0; let isCodeGenOnly = 0; @@ -97,57 +119,168 @@ class MTBUF_Real <MTBUF_Pseudo ps> : let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; - bits<8> vdata; bits<12> offset; - bits<1> offen; - bits<1> idxen; - bits<1> glc; - bits<1> addr64; - bits<4> dfmt; - bits<3> nfmt; - bits<8> vaddr; - bits<7> srsrc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; - - let Inst{11-0} = offset; - let Inst{12} = offen; - let Inst{13} = idxen; - let Inst{14} = glc; - let Inst{22-19} = dfmt; - let Inst{25-23} = nfmt; - let Inst{31-26} = 0x3a; //encoding - let Inst{39-32} = vaddr; - let Inst{47-40} = vdata; - let Inst{52-48} = srsrc{6-2}; - let Inst{54} = slc; - let Inst{55} = tfe; - let Inst{63-56} = soffset; + bits<1> glc; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; +} + +class getMTBUFInsDA<list<RegisterClass> vdataList, + list<RegisterClass> vaddrList=[]> { + RegisterClass 
vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); + RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + dag InsNoData = !if(!empty(vaddrList), + (ins SReg_128:$srsrc, SCSrc_b32:$soffset, + offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe), + (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, + offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe) + ); + dag InsData = !if(!empty(vaddrList), + (ins vdataClass:$vdata, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, + slc:$slc, tfe:$tfe), + (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, + slc:$slc, tfe:$tfe) + ); + dag ret = !if(!empty(vdataList), InsNoData, InsData); } -class MTBUF_Load_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo < - opName, (outs regClass:$dst), - (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, - i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc, - i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset), - " $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"# - " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> { +class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[]> { + dag ret = + !if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA<vdataList>.ret, + !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA<vdataList, [VGPR_32]>.ret, + !if(!eq(addrKind, BUFAddrKind.IdxEn), getMTBUFInsDA<vdataList, [VGPR_32]>.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA<vdataList, [VReg_64]>.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA<vdataList, [VReg_64]>.ret, + (ins)))))); +} + +class getMTBUFAsmOps<int addrKind> { + string Pfx = + !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $dfmt, $nfmt, $soffset", + !if(!eq(addrKind, BUFAddrKind.OffEn), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset offen", + !if(!eq(addrKind, BUFAddrKind.IdxEn), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen", + !if(!eq(addrKind, BUFAddrKind.BothEn), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen offen", + !if(!eq(addrKind, BUFAddrKind.Addr64), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset addr64", + ""))))); + string ret = Pfx # "$offset"; +} + +class MTBUF_SetupAddr<int addrKind> { + bits<1> offen = !if(!eq(addrKind, BUFAddrKind.OffEn), 1, + !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0)); + + bits<1> idxen = !if(!eq(addrKind, BUFAddrKind.IdxEn), 1, + !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0)); + + bits<1> addr64 = !if(!eq(addrKind, BUFAddrKind.Addr64), 1, 0); + + bits<1> has_vaddr = !if(!eq(addrKind, BUFAddrKind.Offset), 0, 1); +} + +class MTBUF_Load_Pseudo <string opName, + int addrKind, + RegisterClass vdataClass, + list<dag> pattern=[], + // Workaround bug bz30254 + int addrKindCopy = addrKind> + : MTBUF_Pseudo<opName, + (outs vdataClass:$vdata), + getMTBUFIns<addrKindCopy>.ret, + " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe", + pattern>, + MTBUF_SetupAddr<addrKindCopy> { + let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; let mayLoad = 1; let mayStore = 0; } -class MTBUF_Store_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo < - opName, (outs), - (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, - i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, - SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset), - " $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"# - " $nfmt, $vaddr, $srsrc, $slc, 
$tfe, $soffset"> { +multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, + ValueType load_vt = i32, + SDPatternOperator ld = null_frag> { + + def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + [(set load_vt:$vdata, + (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$dfmt, + i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>, + MTBUFAddr64Table<0>; + + def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + [(set load_vt:$vdata, + (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, + i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>, + MTBUFAddr64Table<1>; + + def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + + let DisableWQM = 1 in { + def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>; + def _OFFEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + } +} + +class MTBUF_Store_Pseudo <string opName, + int addrKind, + RegisterClass vdataClass, + list<dag> pattern=[], + // Workaround bug bz30254 + int addrKindCopy = addrKind, + RegisterClass vdataClassCopy = vdataClass> + : MTBUF_Pseudo<opName, + (outs), + getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret, + " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe", + pattern>, + MTBUF_SetupAddr<addrKindCopy> { + let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; let mayLoad = 0; let mayStore = 1; } +multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, + ValueType store_vt = i32, + SDPatternOperator st = null_frag> { + + def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, + i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc, + i1:$slc, i1:$tfe))]>, + MTBUFAddr64Table<0>; + + def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc, + i1:$slc, i1:$tfe))]>, + MTBUFAddr64Table<1>; + + def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + + let DisableWQM = 1 in { + def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>; + def _OFFEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + } +} + + //===----------------------------------------------------------------------===// // MUBUF classes //===----------------------------------------------------------------------===// @@ -676,14 +809,14 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", // MTBUF Instructions //===----------------------------------------------------------------------===// -//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0, "tbuffer_load_format_x", []>; -//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <1, "tbuffer_load_format_xy", []>; -//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <2, "tbuffer_load_format_xyz", []>; -def 
TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Pseudo <"tbuffer_load_format_xyzw", VReg_128>; -def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Pseudo <"tbuffer_store_format_x", VGPR_32>; -def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Pseudo <"tbuffer_store_format_xy", VReg_64>; -def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Pseudo <"tbuffer_store_format_xyz", VReg_128>; -def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Pseudo <"tbuffer_store_format_xyzw", VReg_128>; +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_128>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; } // End let SubtargetPredicate = isGCN @@ -1093,22 +1226,98 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OF // MTBUF Patterns //===----------------------------------------------------------------------===// -// TBUFFER_STORE_FORMAT_*, addr64=0 -class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF_Pseudo opcode> : Pat< - (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, - i32:$soffset, imm:$inst_offset, imm:$dfmt, - imm:$nfmt, imm:$offen, imm:$idxen, - imm:$glc, imm:$slc, imm:$tfe), - (opcode - $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen), - (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc, - (as_i1imm $slc), (as_i1imm $tfe), $soffset) ->; +//===----------------------------------------------------------------------===// +// tbuffer_load/store_format patterns +//===----------------------------------------------------------------------===// + +multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, + string opcode> { + def : Pat< + (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; -def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>; -def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>; -def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>; -def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>; + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast<MTBUF_Pseudo>(opcode # _BOTHEN) + (REG_SEQUENCE VReg_64, 
$vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, i32, "TBUFFER_LOAD_FORMAT_X">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2i32, "TBUFFER_LOAD_FORMAT_XY">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4i32, "TBUFFER_LOAD_FORMAT_XYZW">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f32, "TBUFFER_LOAD_FORMAT_X">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">; + +multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, + string opcode> { + def : Pat< + (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), + (as_i8imm $nfmt), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), + (as_i8imm $nfmt), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), + (as_i8imm $nfmt), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, + imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact) + $vdata, + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, i32, "TBUFFER_STORE_FORMAT_X">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2i32, "TBUFFER_STORE_FORMAT_XY">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4i32, "TBUFFER_STORE_FORMAT_XYZ">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4i32, "TBUFFER_STORE_FORMAT_XYZW">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, f32, "TBUFFER_STORE_FORMAT_X">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4f32, "TBUFFER_STORE_FORMAT_XYZ">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">; } // End let Predicates = [isGCN] @@ -1224,21 +1433,44 @@ def BUFFER_WBINVL1_si : MUBUF_Real_si <0x71, BUFFER_WBINVL1>; class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> : MTBUF_Real<ps>, + Enc64, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> { let AssemblerPredicate=isSICI; let DecoderNamespace="SICI"; - bits<1> addr64; - let Inst{15} = addr64; + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{15} = ps.addr64; let Inst{18-16} = op; + let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); + let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + 
let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } -def TBUFFER_LOAD_FORMAT_XYZW_si : MTBUF_Real_si <3, TBUFFER_LOAD_FORMAT_XYZW>; -def TBUFFER_STORE_FORMAT_X_si : MTBUF_Real_si <4, TBUFFER_STORE_FORMAT_X>; -def TBUFFER_STORE_FORMAT_XY_si : MTBUF_Real_si <5, TBUFFER_STORE_FORMAT_XY>; -def TBUFFER_STORE_FORMAT_XYZ_si : MTBUF_Real_si <6, TBUFFER_STORE_FORMAT_XYZ>; -def TBUFFER_STORE_FORMAT_XYZW_si : MTBUF_Real_si <7, TBUFFER_STORE_FORMAT_XYZW>; +multiclass MTBUF_Real_AllAddr_si<bits<3> op> { + def _OFFSET_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>; + def _ADDR64_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_ADDR64")>; + def _OFFEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>; + def _IDXEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>; + def _BOTHEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; +} +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_si <0>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_si <1>; +//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_si <3>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_si <4>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_si <5>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_si <6>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_si <7>; //===----------------------------------------------------------------------===// // CI @@ -1350,16 +1582,39 @@ def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>; class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> : MTBUF_Real<ps>, + Enc64, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> { let AssemblerPredicate=isVI; let DecoderNamespace="VI"; + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); let Inst{18-15} = op; + let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); + let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } -def TBUFFER_LOAD_FORMAT_XYZW_vi : MTBUF_Real_vi <3, TBUFFER_LOAD_FORMAT_XYZW>; -def TBUFFER_STORE_FORMAT_X_vi : MTBUF_Real_vi <4, TBUFFER_STORE_FORMAT_X>; -def TBUFFER_STORE_FORMAT_XY_vi : MTBUF_Real_vi <5, TBUFFER_STORE_FORMAT_XY>; -def TBUFFER_STORE_FORMAT_XYZ_vi : MTBUF_Real_vi <6, TBUFFER_STORE_FORMAT_XYZ>; -def TBUFFER_STORE_FORMAT_XYZW_vi : MTBUF_Real_vi <7, TBUFFER_STORE_FORMAT_XYZW>; +multiclass MTBUF_Real_AllAddr_vi<bits<4> op> { + def _OFFSET_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>; + def _OFFEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>; + def _IDXEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>; + def _BOTHEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; +} +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <1>; +//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <2>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <3>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <4>; +defm TBUFFER_STORE_FORMAT_XY : 
MTBUF_Real_AllAddr_vi <5>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>; diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 88c92b9582fd0..04308fb3aaf64 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -49,6 +49,17 @@ addOperand(MCInst &Inst, const MCOperand& Opnd) { MCDisassembler::SoftFail; } +static int insertNamedMCOperand(MCInst &MI, const MCOperand &Op, + uint16_t NameIdx) { + int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), NameIdx); + if (OpIdx != -1) { + auto I = MI.begin(); + std::advance(I, OpIdx); + MI.insert(I, Op); + } + return OpIdx; +} + static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); @@ -106,12 +117,12 @@ static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm)); } -#define DECODE_SDWA9(DecName) \ -DECODE_OPERAND(decodeSDWA9##DecName, decodeSDWA9##DecName) +#define DECODE_SDWA(DecName) \ +DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName) -DECODE_SDWA9(Src32) -DECODE_SDWA9(Src16) -DECODE_SDWA9(VopcDst) +DECODE_SDWA(Src32) +DECODE_SDWA(Src16) +DECODE_SDWA(VopcDst) #include "AMDGPUGenDisassemblerTables.inc" @@ -149,6 +160,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, raw_ostream &WS, raw_ostream &CS) const { CommentStream = &CS; + bool IsSDWA = false; // ToDo: AMDGPUDisassembler supports only VI ISA. if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]) @@ -170,10 +182,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (Res) break; Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address); - if (Res) break; + if (Res) { IsSDWA = true; break; } Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address); - if (Res) break; + if (Res) { IsSDWA = true; break; } } // Reinitialize Bytes as DPP64 could have eaten too much @@ -200,17 +212,36 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si || MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi)) { // Insert dummy unused src2_modifiers. - int Src2ModIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), - AMDGPU::OpName::src2_modifiers); - auto I = MI.begin(); - std::advance(I, Src2ModIdx); - MI.insert(I, MCOperand::createImm(0)); + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src2_modifiers); } + if (Res && IsSDWA) + Res = convertSDWAInst(MI); + Size = Res ? 
(MaxInstBytesNum - Bytes.size()) : 0; return Res; } +DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { + if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) { + if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst) != -1) + // VOPC - insert clamp + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp); + } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) { + int SDst = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst); + if (SDst != -1) { + // VOPC - insert VCC register as sdst + insertNamedMCOperand(MI, MCOperand::createReg(AMDGPU::VCC), + AMDGPU::OpName::sdst); + } else { + // VOP1/2 - insert omod if present in instruction + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod); + } + } + return MCDisassembler::Success; +} + const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const { return getContext().getRegisterInfo()-> getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]); @@ -524,8 +555,6 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN); } - assert(Width == OPW16 || Width == OPW32 || Width == OPW64); - if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX) return decodeIntImmed(Val); @@ -592,36 +621,43 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { return errOperand(Val, "unknown operand encoding " + Twine(Val)); } -MCOperand AMDGPUDisassembler::decodeSDWA9Src(const OpWidthTy Width, - unsigned Val) const { +MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, + unsigned Val) const { using namespace AMDGPU::SDWA; - if (SDWA9EncValues::SRC_VGPR_MIN <= Val && - Val <= SDWA9EncValues::SRC_VGPR_MAX) { - return createRegOperand(getVgprClassId(Width), - Val - SDWA9EncValues::SRC_VGPR_MIN); - } - if (SDWA9EncValues::SRC_SGPR_MIN <= Val && - Val <= SDWA9EncValues::SRC_SGPR_MAX) { - return createSRegOperand(getSgprClassId(Width), - Val - SDWA9EncValues::SRC_SGPR_MIN); - } + if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) { + if (SDWA9EncValues::SRC_VGPR_MIN <= Val && + Val <= SDWA9EncValues::SRC_VGPR_MAX) { + return createRegOperand(getVgprClassId(Width), + Val - SDWA9EncValues::SRC_VGPR_MIN); + } + if (SDWA9EncValues::SRC_SGPR_MIN <= Val && + Val <= SDWA9EncValues::SRC_SGPR_MAX) { + return createSRegOperand(getSgprClassId(Width), + Val - SDWA9EncValues::SRC_SGPR_MIN); + } - return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN); + return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN); + } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) { + return createRegOperand(getVgprClassId(Width), Val); + } + llvm_unreachable("unsupported target"); } -MCOperand AMDGPUDisassembler::decodeSDWA9Src16(unsigned Val) const { - return decodeSDWA9Src(OPW16, Val); +MCOperand AMDGPUDisassembler::decodeSDWASrc16(unsigned Val) const { + return decodeSDWASrc(OPW16, Val); } -MCOperand AMDGPUDisassembler::decodeSDWA9Src32(unsigned Val) const { - return decodeSDWA9Src(OPW32, Val); +MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const { + return decodeSDWASrc(OPW32, Val); } -MCOperand AMDGPUDisassembler::decodeSDWA9VopcDst(unsigned Val) const { +MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const { using namespace AMDGPU::SDWA; + assert(STI.getFeatureBits()[AMDGPU::FeatureGFX9] && + "SDWAVopcDst should be present only on GFX9"); if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) { Val &= 
SDWA9EncValues::VOPC_DST_SGPR_MASK; if (Val > AMDGPU::EncValues::SGPR_MAX) { diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 5fa3cf1a223fa..3d71db909e20d 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -65,6 +65,8 @@ public: uint64_t Inst, uint64_t Address) const; + DecodeStatus convertSDWAInst(MCInst &MI) const; + MCOperand decodeOperand_VGPR_32(unsigned Val) const; MCOperand decodeOperand_VS_32(unsigned Val) const; MCOperand decodeOperand_VS_64(unsigned Val) const; @@ -105,10 +107,10 @@ public: MCOperand decodeSpecialReg32(unsigned Val) const; MCOperand decodeSpecialReg64(unsigned Val) const; - MCOperand decodeSDWA9Src(const OpWidthTy Width, unsigned Val) const; - MCOperand decodeSDWA9Src16(unsigned Val) const; - MCOperand decodeSDWA9Src32(unsigned Val) const; - MCOperand decodeSDWA9VopcDst(unsigned Val) const; + MCOperand decodeSDWASrc(const OpWidthTy Width, unsigned Val) const; + MCOperand decodeSDWASrc16(unsigned Val) const; + MCOperand decodeSDWASrc32(unsigned Val) const; + MCOperand decodeSDWAVopcDst(unsigned Val) const; }; //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td index 98eda288bcacb..edca6fcd812c8 100644 --- a/lib/Target/AMDGPU/FLATInstructions.td +++ b/lib/Target/AMDGPU/FLATInstructions.td @@ -31,8 +31,6 @@ class FLAT_Pseudo<string opName, dag outs, dag ins, let VM_CNT = 1; let LGKM_CNT = 1; - let Uses = [EXEC, FLAT_SCR]; // M0 - let UseNamedOperandTable = 1; let hasSideEffects = 0; let SchedRW = [WriteVMEM]; @@ -40,10 +38,16 @@ class FLAT_Pseudo<string opName, dag outs, dag ins, string Mnemonic = opName; string AsmOperands = asmOps; + bits<1> is_flat_global = 0; + bits<1> is_flat_scratch = 0; + bits<1> has_vdst = 1; bits<1> has_data = 1; bits<1> has_glc = 1; bits<1> glcValue = 0; + + // TODO: M0 if it could possibly access LDS (before gfx9? only)? + let Uses = !if(is_flat_global, [EXEC], [EXEC, FLAT_SCR]); } class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : @@ -68,7 +72,10 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : // Only valid on gfx9 bits<1> lds = 0; // XXX - What does this actually do? - bits<2> seg; // Segment, 00=flat, 01=scratch, 10=global, 11=reserved + + // Segment, 00=flat, 01=scratch, 10=global, 11=reserved + bits<2> seg = !if(ps.is_flat_global, 0b10, + !if(ps.is_flat_scratch, 0b01, 0)); // Signed offset. Highest bit ignored for flat and treated as 12-bit // unsigned for flat accesses.
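The seg field above now derives its value from the pseudo's is_flat_global / is_flat_scratch flags instead of being left uninitialized. A minimal C++ sketch of the same selection logic (the helper name is illustrative, not part of this commit):

    // Mirrors the TableGen !if chain: 00 = flat, 01 = scratch,
    // 10 = global, 11 = reserved.
    static unsigned getSegEncoding(bool IsFlatGlobal, bool IsFlatScratch) {
      if (IsFlatGlobal)
        return 0b10; // global aperture
      if (IsFlatScratch)
        return 0b01; // scratch aperture
      return 0b00;   // plain flat addressing
    }

The result feeds the Inst{15-14} bits in the encoding hunk that follows.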
@@ -81,7 +88,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : // Only valid on GFX9+ let Inst{12-0} = offset; let Inst{13} = lds; - let Inst{15-14} = 0; + let Inst{15-14} = seg; let Inst{16} = !if(ps.has_glc, glc, ps.glcValue); let Inst{17} = slc; @@ -106,6 +113,16 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass, let mayLoad = 1; } +class FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass> : + FLAT_Load_Pseudo<opName, regClass, 1> { + let is_flat_global = 1; +} + +class FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass> : + FLAT_Load_Pseudo<opName, regClass, 1> { + let is_flat_scratch = 1; +} + class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, bit HasSignedOffset = 0> : FLAT_Pseudo< opName, @@ -119,6 +136,16 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, let has_vdst = 0; } +class FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> : + FLAT_Store_Pseudo<opName, regClass, 1> { + let is_flat_global = 1; +} + +class FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> : + FLAT_Store_Pseudo<opName, regClass, 1> { + let is_flat_scratch = 1; +} + multiclass FLAT_Atomic_Pseudo< string opName, RegisterClass vdst_rc, @@ -306,6 +333,26 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2", } // End SubtargetPredicate = isCI +let SubtargetPredicate = HasFlatGlobalInsts in { +def GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>; +def GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>; +def GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>; +def GLOBAL_LOAD_SSHORT : FLAT_Global_Load_Pseudo <"global_load_sshort", VGPR_32>; +def GLOBAL_LOAD_DWORD : FLAT_Global_Load_Pseudo <"global_load_dword", VGPR_32>; +def GLOBAL_LOAD_DWORDX2 : FLAT_Global_Load_Pseudo <"global_load_dwordx2", VReg_64>; +def GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg_96>; +def GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>; + +def GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>; +def GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>; +def GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR_32>; +def GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>; +def GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>; +def GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>; + +} // End SubtargetPredicate = HasFlatGlobalInsts + + //===----------------------------------------------------------------------===// // Flat Patterns //===----------------------------------------------------------------------===// @@ -557,3 +604,18 @@ defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_vi <0x6a, FLAT_ATOMIC_XOR_X2>; defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_vi <0x6b, FLAT_ATOMIC_INC_X2>; defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_vi <0x6c, FLAT_ATOMIC_DEC_X2>; +def GLOBAL_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, GLOBAL_LOAD_UBYTE>; +def GLOBAL_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, GLOBAL_LOAD_SBYTE>; +def GLOBAL_LOAD_USHORT_vi : FLAT_Real_vi <0x12, GLOBAL_LOAD_USHORT>; +def GLOBAL_LOAD_SSHORT_vi : FLAT_Real_vi <0x13, GLOBAL_LOAD_SSHORT>; +def GLOBAL_LOAD_DWORD_vi : FLAT_Real_vi <0x14, GLOBAL_LOAD_DWORD>; +def GLOBAL_LOAD_DWORDX2_vi : FLAT_Real_vi <0x15, GLOBAL_LOAD_DWORDX2>; +def GLOBAL_LOAD_DWORDX4_vi : FLAT_Real_vi <0x17, 
GLOBAL_LOAD_DWORDX4>; +def GLOBAL_LOAD_DWORDX3_vi : FLAT_Real_vi <0x16, GLOBAL_LOAD_DWORDX3>; + +def GLOBAL_STORE_BYTE_vi : FLAT_Real_vi <0x18, GLOBAL_STORE_BYTE>; +def GLOBAL_STORE_SHORT_vi : FLAT_Real_vi <0x1a, GLOBAL_STORE_SHORT>; +def GLOBAL_STORE_DWORD_vi : FLAT_Real_vi <0x1c, GLOBAL_STORE_DWORD>; +def GLOBAL_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, GLOBAL_STORE_DWORDX2>; +def GLOBAL_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, GLOBAL_STORE_DWORDX4>; +def GLOBAL_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, GLOBAL_STORE_DWORDX3>; diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index b84640230eeeb..7c31c8e397ba7 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -72,6 +72,11 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); } +void AMDGPUInstPrinter::printS16ImmDecOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatDec(static_cast<int16_t>(MI->getOperand(OpNo).getImm())); +} + void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -118,6 +123,16 @@ void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo, } } +void AMDGPUInstPrinter::printOffsetS13(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint16_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm != 0) { + O << ((OpNo == 0)? "offset:" : " offset:"); + printS16ImmDecOperand(MI, OpNo, O); + } +} + void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -216,6 +231,24 @@ void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo, O << " vm"; } +void AMDGPUInstPrinter::printDFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " dfmt:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printNFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " nfmt:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, const MCRegisterInfo &MRI) { switch (RegNo) { @@ -379,7 +412,6 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O) { uint16_t Lo16 = static_cast<uint16_t>(Imm); - assert(Lo16 == static_cast<uint16_t>(Imm >> 16)); printImmediate16(Lo16, STI, O); } diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index c8094c4b840a1..7bbf99a85f409 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -42,6 +42,7 @@ private: void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printS16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU32ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O, @@ -52,6 +53,9 @@ private: void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); void 
printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printOffsetS13(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printOffset1(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -84,6 +88,10 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printExpVM(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printDFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printNFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printRegOperand(unsigned RegNo, raw_ostream &O); void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 0a9c2b94c1eee..2b408ff10caae 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -30,14 +30,9 @@ public: unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; }; - void processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override; - - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override { @@ -102,36 +97,11 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, } } -void AMDGPUAsmBackend::processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) { - MCValue Res; - - // When we have complex expressions like: BB0_1 + (BB0_2 - 4), which are - // used for long branches, this function will be called with - // IsResolved = false and Value set to some pre-computed value. In - // the example above, the value would be: - // (BB0_1 + (BB0_2 - 4)) - CurrentOffsetFromStartOfFunction. - // This is not what we want. We just want the expression computation - // only. The reason the MC layer subtracts the current offset from the - // expression is because the fixup is of kind FK_PCRel_4. - // For these scenarios, evaluateAsValue gives us the computation that we - // want. - if (!IsResolved && Fixup.getValue()->evaluateAsValue(Res, Layout) && - Res.isAbsolute()) { - Value = Res.getConstant(); - IsResolved = true; - - } - if (IsResolved) - Value = adjustFixupValue(Fixup, Value, &Asm.getContext()); -} - -void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { + Value = adjustFixupValue(Fixup, Value, &Asm.getContext()); if (!Value) return; // Doesn't change encoding. 
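With processFixupValue removed, applyFixup now calls adjustFixupValue itself and patches the fragment bytes directly through the new MutableArrayRef<char> parameter. A hypothetical helper capturing the shape of the byte-masking loop referenced in the next hunk (assumed; the loop body itself is outside this diff's context lines):

    #include "llvm/ADT/ArrayRef.h"
    #include <cassert>
    #include <cstdint>

    // OR each byte of the shifted fixup value into the instruction bytes.
    static void patchBytes(llvm::MutableArrayRef<char> Data, uint32_t Offset,
                           uint64_t Value, unsigned NumBytes) {
      assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
      for (unsigned i = 0; i != NumBytes; ++i)
        Data[Offset + i] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff);
    }

Passing Data as a MutableArrayRef is what lets the assert below check against Data.size() rather than a separately threaded DataSize.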
@@ -142,7 +112,7 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); uint32_t Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the bits from // the fixup value. diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index a856b17a228f0..1b062064ace1c 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -52,15 +52,15 @@ public: return 0; } - virtual unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { + virtual unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { return 0; } - virtual unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { + virtual unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { return 0; } diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index e02acf516c0db..376c9bfe5ccf2 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -70,13 +70,13 @@ public: SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; - unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; + unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; - unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; + unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; }; } // end anonymous namespace @@ -252,9 +252,7 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { uint16_t Lo16 = static_cast<uint16_t>(Imm); - assert(Lo16 == static_cast<uint16_t>(Imm >> 16)); uint32_t Encoding = getLit16Encoding(Lo16, STI); - assert(Encoding != 255 && "packed constants can only be inline immediates"); return Encoding; } default: @@ -328,11 +326,11 @@ unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, } unsigned -SIMCCodeEmitter::getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { +SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { using namespace AMDGPU::SDWA; - + uint64_t RegEnc = 0; const MCOperand &MO = MI.getOperand(OpNo); @@ -347,9 +345,9 @@ SIMCCodeEmitter::getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, } unsigned -SIMCCodeEmitter::getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { 
+SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { using namespace AMDGPU::SDWA; uint64_t RegEnc = 0; @@ -365,6 +363,25 @@ SIMCCodeEmitter::getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, return RegEnc; } +static bool needsPCRel(const MCExpr *Expr) { + switch (Expr->getKind()) { + case MCExpr::SymbolRef: + return true; + case MCExpr::Binary: { + auto *BE = cast<MCBinaryExpr>(Expr); + if (BE->getOpcode() == MCBinaryExpr::Sub) + return false; + return needsPCRel(BE->getLHS()) || needsPCRel(BE->getRHS()); + } + case MCExpr::Unary: + return needsPCRel(cast<MCUnaryExpr>(Expr)->getSubExpr()); + case MCExpr::Target: + case MCExpr::Constant: + return false; + } + llvm_unreachable("invalid kind"); +} + uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, @@ -373,12 +390,21 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, return MRI.getEncodingValue(MO.getReg()); if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) { - const auto *Expr = dyn_cast<MCSymbolRefExpr>(MO.getExpr()); + // FIXME: Whether this expression is PCRel or not should not depend on what + // the expression looks like. Given that this is just a general expression, + // it should probably be FK_Data_4 and whatever is producing + // + // s_add_u32 s2, s2, (extern_const_addrspace+16 + // + // And expecting a PCRel should instead produce + // + // .Ltmp1: + // s_add_u32 s2, s2, (extern_const_addrspace+16)-.Ltmp1 MCFixupKind Kind; - if (Expr && Expr->getSymbol().isExternal()) - Kind = FK_Data_4; - else + if (needsPCRel(MO.getExpr())) Kind = FK_PCRel_4; + else + Kind = FK_Data_4; Fixups.push_back(MCFixup::create(4, MO.getExpr(), Kind, MI.getLoc())); } diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td index f6f2582aa11b3..d30d1d382588c 100644 --- a/lib/Target/AMDGPU/Processors.td +++ b/lib/Target/AMDGPU/Processors.td @@ -80,7 +80,7 @@ def : Proc<"cayman", R600_VLIW4_Itin, // Southern Islands //===----------------------------------------------------------------------===// -def : ProcessorModel<"gfx600", SIFullSpeedModel, +def : ProcessorModel<"gfx600", SIFullSpeedModel, [FeatureISAVersion6_0_0]>; def : ProcessorModel<"SI", SIFullSpeedModel, @@ -95,7 +95,7 @@ def : ProcessorModel<"gfx601", SIQuarterSpeedModel, [FeatureISAVersion6_0_1] >; -def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, +def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureISAVersion6_0_1]>; def : ProcessorModel<"verde", SIQuarterSpeedModel, diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index c55878f8bff0f..215791f4f92dd 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -584,23 +584,23 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const return LowerImplicitParameter(DAG, VT, DL, 8); case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_X, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_X, VT); case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Y, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Y, VT); case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Z, VT); + 
return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Z, VT); case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_X, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_X, VT); case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Y, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Y, VT); case Intrinsic::r600_read_tidig_z: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Z, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Z, VT); case Intrinsic::r600_recipsqrt_ieee: return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index 5cd90323ff67b..3915c0e5bdbed 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -118,9 +118,9 @@ namespace AMDGPU { // Operand for source modifiers for VOP instructions OPERAND_INPUT_MODS, - // Operand for GFX9 SDWA instructions - OPERAND_SDWA9_SRC, - OPERAND_SDWA9_VOPC_DST, + // Operand for SDWA instructions + OPERAND_SDWA_SRC, + OPERAND_SDWA_VOPC_DST, /// Operand with 32-bit immediate that uses the constant bus. OPERAND_KIMM32, diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 5f5f25103c027..0a795c99f94e5 100644 --- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -174,6 +174,31 @@ static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC); } +static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, + const SIRegisterInfo *TRI, + const SIInstrInfo *TII) { + MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + auto &Src = MI.getOperand(1); + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned SrcReg = Src.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || + !TargetRegisterInfo::isVirtualRegister(DstReg)) + return false; + + for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) { + const auto *UseMI = MO.getParent(); + if (UseMI == &MI) + continue; + if (MO.isDef() || UseMI->getParent() != MI.getParent() || + UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END || + !TII->isOperandLegal(*UseMI, UseMI->getOperandNo(&MO), &Src)) + return false; + } + // Change VGPR to SGPR destination. + MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg))); + return true; +} + // Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. // // SGPRx = ... @@ -214,6 +239,9 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) return false; + if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII)) + return true; + // TODO: Could have multiple extracts? 
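// A hypothetical before/after for tryChangeVGPRtoSGPRinCopy (MIR shorthand,
// not taken from this diff):
//   %1:vgpr_32 = COPY %0:sgpr_32   ; every non-debug use of %1 is in the same
//                                  ; block and accepts an SGPR operand
// becomes, after MRI.setRegClass on the destination,
//   %1:sgpr_32 = COPY %0:sgpr_32
// keeping the value on the scalar unit instead of forcing a VALU round trip.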
unsigned SubReg = CopyUse.getOperand(1).getSubReg(); if (SubReg != AMDGPU::NoSubRegister) @@ -563,6 +591,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { break; } TII->moveToVALU(MI); + } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) { + tryChangeVGPRtoSGPRinCopy(MI, TRI, TII); } break; diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index e10f1ed3762e8..f391f67a241f1 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -13,6 +13,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -166,6 +167,8 @@ static bool updateOperand(FoldCandidate &Fold, if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) && TargetRegisterInfo::isVirtualRegister(New->getReg())) { Old.substVirtReg(New->getReg(), New->getSubReg(), TRI); + + Old.setIsUndef(New->isUndef()); return true; } @@ -470,7 +473,7 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI, return &Op; MachineInstr *Def = MRI.getVRegDef(Op.getReg()); - if (Def->isMoveImmediate()) { + if (Def && Def->isMoveImmediate()) { MachineOperand &ImmSrc = Def->getOperand(1); if (ImmSrc.isImm()) return &ImmSrc; @@ -921,12 +924,9 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { // level. bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath(); - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock *MBB : depth_first(&MF)) { MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { + for (I = MBB->begin(); I != MBB->end(); I = Next) { Next = std::next(I); MachineInstr &MI = *I; diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index b1bd14e421f02..08a64de385018 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -284,7 +284,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; - if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) { + if (ST.isAmdCodeObjectV2(MF)) { PreloadedPrivateBufferReg = TRI->getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); } @@ -363,14 +363,14 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, // Use relocations to get the pointer, and setup the other bits manually. 
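// Context sketch (summarizing the surrounding prologue code, not added by
// the patch): the 128-bit scratch resource descriptor is assembled from two
// halves:
//   Rsrc01 - the 64-bit base, copied from (or loaded via) the implicit
//            buffer pointer user SGPRs,
//   Rsrc23 - the constant descriptor words from TII->getScratchRsrcWords23(),
// so only the base pointer depends on the preloaded registers.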
uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - if (MFI->hasPrivateMemoryInputPtr()) { + if (MFI->hasImplicitBufferPtr()) { unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64); BuildMI(MBB, I, DL, Mov64, Rsrc01) - .addReg(PreloadedPrivateBufferReg) + .addReg(MFI->getImplicitBufferPtrUserSGPR()) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); } else { const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); @@ -385,7 +385,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MachineMemOperand::MODereferenceable, 0, 0); BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01) - .addReg(PreloadedPrivateBufferReg) + .addReg(MFI->getImplicitBufferPtrUserSGPR()) .addImm(0) // offset .addImm(0) // glc .addMemOperand(MMO) @@ -417,14 +417,69 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - if (MFI->isEntryFunction()) + const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + if (FuncInfo->isEntryFunction()) { emitEntryFunctionPrologue(MF, MBB); + return; + } + + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); + unsigned FramePtrReg = FuncInfo->getFrameOffsetReg(); + + MachineBasicBlock::iterator MBBI = MBB.begin(); + DebugLoc DL; + + bool NeedFP = hasFP(MF); + if (NeedFP) { + // If we need a base pointer, set it up here. It's whatever the value of + // the stack pointer is at this point. Any variable size objects will be + // allocated after this, so we can still use the base pointer to reference + // locals. + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) + .addReg(StackPtrReg) + .setMIFlag(MachineInstr::FrameSetup); + } + + uint32_t NumBytes = MFI.getStackSize(); + if (NumBytes != 0 && hasSP(MF)) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) + .addReg(StackPtrReg) + .addImm(NumBytes * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameSetup); + } } void SIFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { + const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + if (FuncInfo->isEntryFunction()) + return; + unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); + if (StackPtrReg == AMDGPU::NoRegister) + return; + + const MachineFrameInfo &MFI = MF.getFrameInfo(); + uint32_t NumBytes = MFI.getStackSize(); + + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc DL; + + // FIXME: Clarify distinction between no set SP and SP. For callee functions, + // it's really whether we need SP to be accurate or not. 
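// Worked example for the scaling below (numbers assumed for illustration):
// a 1024-byte frame on a wavefront-64 target reserves 1024 * 64 = 65536
// bytes of scratch, so the prologue adds 0x10000 to the stack pointer SGPR
// and this epilogue subtracts the same amount. The multiply by
// ST.getWavefrontSize() reflects that scratch is allocated per-wave while
// frame sizes are counted per-lane.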
+ + if (NumBytes != 0 && hasSP(MF)) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) + .addReg(StackPtrReg) + .addImm(NumBytes * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameDestroy); + } } static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { @@ -557,3 +612,19 @@ void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF, WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI); } } + +bool SIFrameLowering::hasFP(const MachineFunction &MF) const { + // All stack operations are relative to the frame offset SGPR. + // TODO: Still want to eliminate sometimes. + const MachineFrameInfo &MFI = MF.getFrameInfo(); + + // XXX - Is this only called after frame is finalized? Should be able to check + // frame size. + return MFI.hasStackObjects() && !allStackObjectsAreDead(MFI); +} + +bool SIFrameLowering::hasSP(const MachineFunction &MF) const { + // All stack operations are relative to the frame offset SGPR. + const MachineFrameInfo &MFI = MF.getFrameInfo(); + return MFI.hasCalls() || MFI.hasVarSizedObjects(); +} diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h index e17adbe273614..d4dfa1c7eaa86 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.h +++ b/lib/Target/AMDGPU/SIFrameLowering.h @@ -60,6 +60,10 @@ private: /// \brief Emits debugger prologue. void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const; + +public: + bool hasFP(const MachineFunction &MF) const override; + bool hasSP(const MachineFunction &MF) const; }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 441f1ef4bd04c..d0f4e00994de1 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -211,6 +211,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::UADDO, MVT::i32, Legal); setOperationAction(ISD::USUBO, MVT::i32, Legal); + setOperationAction(ISD::ADDCARRY, MVT::i32, Legal); + setOperationAction(ISD::SUBCARRY, MVT::i32, Legal); + // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, @@ -471,6 +474,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, VT, Custom); } + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::ADDCARRY); + setTargetDAGCombine(ISD::SUB); + setTargetDAGCombine(ISD::SUBCARRY); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMINNUM); @@ -1061,10 +1068,10 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) { - if (Info.hasPrivateMemoryInputPtr()) { - unsigned PrivateMemoryPtrReg = Info.addPrivateMemoryPtr(TRI); - MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SGPR_64RegClass); - CCInfo.AllocateReg(PrivateMemoryPtrReg); + if (Info.hasImplicitBufferPtr()) { + unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI); + MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); + CCInfo.AllocateReg(ImplicitBufferPtrReg); } // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
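The ADDCARRY/SUBCARRY legality and the ADD/SUB combines registered earlier in this file feed the performAddCombine/performSubCombine implementations further down. An illustrative source-level pattern they target (plain C++; the 64-bit add split into 32-bit halves is an assumed motivating case, not code from this commit):

    #include <cstdint>

    // lo/hi halves of a 64-bit add; the carry comes from an unsigned compare.
    static uint64_t add64ViaCarry(uint32_t al, uint32_t ah,
                                  uint32_t bl, uint32_t bh) {
      uint32_t lo = al + bl;            // ISD::ADD
      uint32_t carry = lo < al ? 1 : 0; // ISD::SETCC, zero-extended
      uint32_t hi = ah + bh + carry;    // add x, (zext setcc) -> ISD::ADDCARRY
      return (uint64_t(hi) << 32) | lo;
    }

After the combine, the compare result travels as the i1 carry operand of ADDCARRY instead of a materialized 0/1 value, which maps naturally onto the VALU add-with-carry instructions.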
@@ -1227,7 +1234,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, } } - if (NeedSP){ + if (NeedSP) { unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF); Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg); @@ -2998,7 +3005,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { case Intrinsic::amdgcn_implicit_buffer_ptr: { - unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + if (getSubtarget()->isAmdCodeObjectV2(MF)) + return emitNonHSAIntrinsicError(DAG, DL, VT); + + unsigned Reg = TRI->getPreloadedValue(MF, + SIRegisterInfo::IMPLICIT_BUFFER_PTR); return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); } case Intrinsic::amdgcn_dispatch_ptr: @@ -3288,6 +3299,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); SDLoc DL(Op); + MachineFunction &MF = DAG.getMachineFunction(); + switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: { @@ -3313,7 +3326,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // glc Op.getOperand(6) // slc }; - MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? @@ -3328,6 +3340,29 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO); } + case Intrinsic::amdgcn_tbuffer_load: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Op.getOperand(4), // voffset + Op.getOperand(5), // soffset + Op.getOperand(6), // offset + Op.getOperand(7), // dfmt + Op.getOperand(8), // nfmt + Op.getOperand(9), // glc + Op.getOperand(10) // slc + }; + + EVT VT = Op.getOperand(2).getValueType(); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad, + VT.getStoreSize(), VT.getStoreSize()); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); + } // Basic sample. 
case Intrinsic::amdgcn_image_sample: case Intrinsic::amdgcn_image_sample_cl: @@ -3393,10 +3428,10 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + MachineFunction &MF = DAG.getMachineFunction(); switch (IntrinsicID) { case Intrinsic::amdgcn_exp: { @@ -3463,33 +3498,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain, Op.getOperand(2), Op.getOperand(3)); } - case AMDGPUIntrinsic::SI_tbuffer_store: { - SDValue Ops[] = { - Chain, - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4), - Op.getOperand(5), - Op.getOperand(6), - Op.getOperand(7), - Op.getOperand(8), - Op.getOperand(9), - Op.getOperand(10), - Op.getOperand(11), - Op.getOperand(12), - Op.getOperand(13), - Op.getOperand(14) - }; - - EVT VT = Op.getOperand(3).getValueType(); - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); - } case AMDGPUIntrinsic::AMDGPU_kill: { SDValue Src = Op.getOperand(2); if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) { @@ -3505,7 +3513,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } case Intrinsic::amdgcn_s_barrier: { if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { - const MachineFunction &MF = DAG.getMachineFunction(); const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second; if (WGSize <= ST.getWavefrontSize()) @@ -3514,6 +3521,75 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } return SDValue(); }; + case AMDGPUIntrinsic::SI_tbuffer_store: { + + // Extract vindex and voffset from vaddr as appropriate + const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10)); + const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11)); + SDValue VAddr = Op.getOperand(5); + + SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); + + assert(!(OffEn->isOne() && IdxEn->isOne()) && + "Legacy intrinsic doesn't support both offset and index - use new version"); + + SDValue VIndex = IdxEn->isOne() ? VAddr : Zero; + SDValue VOffset = OffEn->isOne() ? VAddr : Zero; + + // Deal with the vec-3 case + const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4)); + auto Opcode = NumChannels->getZExtValue() == 3 ? 
+ AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT; + + SDValue Ops[] = { + Chain, + Op.getOperand(3), // vdata + Op.getOperand(2), // rsrc + VIndex, + VOffset, + Op.getOperand(6), // soffset + Op.getOperand(7), // inst_offset + Op.getOperand(8), // dfmt + Op.getOperand(9), // nfmt + Op.getOperand(12), // glc + Op.getOperand(13), // slc + }; + + assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 && + "Value of tfe other than zero is unsupported"); + + EVT VT = Op.getOperand(3).getValueType(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(Opcode, DL, + Op->getVTList(), Ops, VT, MMO); + } + + case Intrinsic::amdgcn_tbuffer_store: { + SDValue Ops[] = { + Chain, + Op.getOperand(2), // vdata + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + Op.getOperand(5), // voffset + Op.getOperand(6), // soffset + Op.getOperand(7), // offset + Op.getOperand(8), // dfmt + Op.getOperand(9), // nfmt + Op.getOperand(10), // glc + Op.getOperand(11) // slc + }; + EVT VT = Op.getOperand(3).getValueType(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); + } + default: return Op; } @@ -4839,6 +4915,103 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, return 0; } +SDValue SITargetLowering::performAddCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + if (VT != MVT::i32) + return SDValue(); + + SDLoc SL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // add x, zext (setcc) => addcarry x, 0, setcc + // add x, sext (setcc) => subcarry x, 0, setcc + unsigned Opc = LHS.getOpcode(); + if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND || + Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY) + std::swap(RHS, LHS); + + Opc = RHS.getOpcode(); + switch (Opc) { + default: break; + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ANY_EXTEND: { + auto Cond = RHS.getOperand(0); + if (Cond.getOpcode() != ISD::SETCC && + Cond.getOpcode() != AMDGPUISD::FP_CLASS) + break; + SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); + SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond }; + Opc = (Opc == ISD::SIGN_EXTEND) ? 
ISD::SUBCARRY : ISD::ADDCARRY; + return DAG.getNode(Opc, SL, VTList, Args); + } + case ISD::ADDCARRY: { + // add x, (addcarry y, 0, cc) => addcarry x, y, cc + auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); + if (!C || C->getZExtValue() != 0) break; + SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) }; + return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args); + } + } + return SDValue(); +} + +SDValue SITargetLowering::performSubCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + if (VT != MVT::i32) + return SDValue(); + + SDLoc SL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + unsigned Opc = LHS.getOpcode(); + if (Opc != ISD::SUBCARRY) + std::swap(RHS, LHS); + + if (LHS.getOpcode() == ISD::SUBCARRY) { + // sub (subcarry x, 0, cc), y => subcarry x, y, cc + auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); + if (!C || C->getZExtValue() != 0) + return SDValue(); + SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) }; + return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args); + } + return SDValue(); +} + +SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + + if (N->getValueType(0) != MVT::i32) + return SDValue(); + + auto C = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!C || C->getZExtValue() != 0) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDValue LHS = N->getOperand(0); + + // addcarry (add x, y), 0, cc => addcarry x, y, cc + // subcarry (sub x, y), 0, cc => subcarry x, y, cc + unsigned LHSOpc = LHS.getOpcode(); + unsigned Opc = N->getOpcode(); + if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) || + (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) { + SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) }; + return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args); + } + return SDValue(); +} + SDValue SITargetLowering::performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) @@ -5009,6 +5182,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, switch (N->getOpcode()) { default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + case ISD::ADD: + return performAddCombine(N, DCI); + case ISD::SUB: + return performSubCombine(N, DCI); + case ISD::ADDCARRY: + case ISD::SUBCARRY: + return performAddCarrySubCarryCombine(N, DCI); case ISD::FADD: return performFAddCombine(N, DCI); case ISD::FSUB: @@ -5425,15 +5605,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); } -SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const { - SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT); - - return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), - cast<RegisterSDNode>(VReg)->getReg(), VT); -} - //===----------------------------------------------------------------------===// // SI Inline Assembly Support //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 8e2ec40b224cd..24f88e632d38e 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -108,6 +108,9 @@ class SITargetLowering final : public AMDGPUTargetLowering { unsigned getFusedOpcode(const 
SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; + SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -216,8 +219,6 @@ public: void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override; - SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const override; SDNode *legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const; MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 1097814e99ce2..c9b48fea7225e 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2108,7 +2108,9 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const { - if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET) + if (!MO.isImm() || + OperandType < AMDGPU::OPERAND_SRC_FIRST || + OperandType > AMDGPU::OPERAND_SRC_LAST) return false; // MachineOperand provides no way to tell the true operand size, since it only @@ -2433,8 +2435,73 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + // Verify SDWA + if (isSDWA(MI)) { + + if (!ST.hasSDWA()) { + ErrInfo = "SDWA is not supported on this target"; + return false; + } + + int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); + if ( DstIdx == -1) + DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::sdst); + + const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; + + for (int OpIdx: OpIndicies) { + if (OpIdx == -1) + continue; + const MachineOperand &MO = MI.getOperand(OpIdx); + + if (!ST.hasSDWAScalar()) { + // Only VGPRS on VI + if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { + ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; + return false; + } + } else { + // No immediates on GFX9 + if (!MO.isReg()) { + ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9"; + return false; + } + } + } + + if (!ST.hasSDWAOmod()) { + // No omod allowed on VI + const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); + if (OMod != nullptr && + (!OMod->isImm() || OMod->getImm() != 0)) { + ErrInfo = "OMod not allowed in SDWA instructions on VI"; + return false; + } + } + + uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); + if (isVOPC(BasicOpcode)) { + if (!ST.hasSDWASdst() && DstIdx != -1) { + // Only vcc allowed as dst on VI for VOPC + const MachineOperand &Dst = MI.getOperand(DstIdx); + if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { + ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; + return false; + } + } else if (!ST.hasSDWAClampVOPC()) { + // No clamp allowed on GFX9 for VOPC + const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); + if (Clamp != nullptr && + (!Clamp->isImm() || Clamp->getImm() != 0)) { + ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; + return false; + } + } + } + } + // Verify VOP* - if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) { + if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) 
|| isVOPC(MI) || isSDWA(MI)) { // Only look at the true operands. Only a real operand can use the constant // bus, and we don't want to check pseudo-operands like the source modifier // flags. diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index f6e5e8883f63c..74b48c7618087 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -814,6 +814,9 @@ namespace AMDGPU { int getSDWAOp(uint16_t Opcode); LLVM_READONLY + int getBasicFromSDWAOp(uint16_t Opcode); + + LLVM_READONLY int getCommuteRev(uint16_t Opcode); LLVM_READONLY diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 470a47b024433..3b4a8b5d1e817 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -20,6 +20,8 @@ def SIEncodingFamily { int NONE = -1; int SI = 0; int VI = 1; + int SDWA = 2; + int SDWA9 = 3; } //===----------------------------------------------------------------------===// @@ -39,25 +41,41 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2, [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; -def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", - SDTypeProfile<0, 13, - [SDTCisVT<0, v4i32>, // rsrc(SGPR) - SDTCisVT<1, iAny>, // vdata(VGPR) - SDTCisVT<2, i32>, // num_channels(imm) - SDTCisVT<3, i32>, // vaddr(VGPR) +def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", + SDTypeProfile<1, 9, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) - SDTCisVT<5, i32>, // inst_offset(imm) + SDTCisVT<5, i32>, // offset(imm) SDTCisVT<6, i32>, // dfmt(imm) SDTCisVT<7, i32>, // nfmt(imm) - SDTCisVT<8, i32>, // offen(imm) - SDTCisVT<9, i32>, // idxen(imm) - SDTCisVT<10, i32>, // glc(imm) - SDTCisVT<11, i32>, // slc(imm) - SDTCisVT<12, i32> // tfe(imm) + SDTCisVT<8, i32>, // glc(imm) + SDTCisVT<9, i32> // slc(imm) ]>, - [SDNPMayStore, SDNPMemOperand, SDNPHasChain] + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] >; +def SDTtbuffer_store : SDTypeProfile<0, 10, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) + SDTCisVT<4, i32>, // soffset(SGPR) + SDTCisVT<5, i32>, // offset(imm) + SDTCisVT<6, i32>, // dfmt(imm) + SDTCisVT<7, i32>, // nfmt(imm) + SDTCisVT<8, i32>, // glc(imm) + SDTCisVT<9, i32> // slc(imm) + ]>; + +def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3", + SDTtbuffer_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; + def SDTBufferLoad : SDTypeProfile<1, 5, [ // vdata SDTCisVT<1, v4i32>, // rsrc @@ -452,25 +470,25 @@ def ExpSrc3 : RegisterOperand<VGPR_32> { let ParserMatchClass = VReg32OrOffClass; } -class SDWA9Src : RegisterOperand<VS_32> { +class SDWASrc : RegisterOperand<VS_32> { let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_SDWA9_SRC"; - let EncoderMethod = "getSDWA9SrcEncoding"; + let OperandType = "OPERAND_SDWA_SRC"; + let EncoderMethod = "getSDWASrcEncoding"; } -def SDWA9Src32 : SDWA9Src { - let DecoderMethod = "decodeSDWA9Src32"; +def SDWASrc32 : SDWASrc { + let DecoderMethod = "decodeSDWASrc32"; } -def SDWA9Src16 : SDWA9Src { - let DecoderMethod = "decodeSDWA9Src16"; +def SDWASrc16 : SDWASrc { + let DecoderMethod = "decodeSDWASrc16"; } -def SDWA9VopcDst : VOPDstOperand<SReg_64> { +def SDWAVopcDst : VOPDstOperand<SReg_64> 
{ let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_SDWA9_VOPC_DST"; - let EncoderMethod = "getSDWA9VopcDstEncoding"; - let DecoderMethod = "decodeSDWA9VopcDst"; + let OperandType = "OPERAND_SDWA_VOPC_DST"; + let EncoderMethod = "getSDWAVopcDstEncoding"; + let DecoderMethod = "decodeSDWAVopcDst"; } class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass { @@ -525,7 +543,7 @@ def idxen : NamedOperandBit<"Idxen", NamedMatchClass<"Idxen">>; def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>; def offset_u12 : NamedOperandU12<"Offset", NamedMatchClass<"OffsetU12">>; -def offset_s13 : NamedOperandS13<"Offset", NamedMatchClass<"OffsetS13">>; +def offset_s13 : NamedOperandS13<"OffsetS13", NamedMatchClass<"OffsetS13">>; def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>; def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>; def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>; @@ -545,6 +563,9 @@ def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>; def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>; def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>; +def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>; +def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>; + def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>; @@ -634,13 +655,13 @@ class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass> def Int32InputMods : IntInputMods<Int32InputModsMatchClass>; def Int64InputMods : IntInputMods<Int64InputModsMatchClass>; -def FPRegInputModsMatchClass : AsmOperandClass { - let Name = "RegWithFPInputMods"; +def FPRegSDWAInputModsMatchClass : AsmOperandClass { + let Name = "SDWARegWithFPInputMods"; let ParserMethod = "parseRegWithFPInputMods"; - let PredicateMethod = "isRegKind"; + let PredicateMethod = "isSDWARegKind"; } -def FPRegInputMods : InputMods <FPRegInputModsMatchClass> { +def FPRegSDWAInputMods : InputMods <FPRegSDWAInputModsMatchClass> { let PrintMethod = "printOperandAndFPInputMods"; } @@ -655,13 +676,13 @@ def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> { } -def IntRegInputModsMatchClass : AsmOperandClass { - let Name = "RegWithIntInputMods"; +def IntRegSDWAInputModsMatchClass : AsmOperandClass { + let Name = "SDWARegWithIntInputMods"; let ParserMethod = "parseRegWithIntInputMods"; - let PredicateMethod = "isRegKind"; + let PredicateMethod = "isSDWARegKind"; } -def IntRegInputMods : InputMods <IntRegInputModsMatchClass> { +def IntRegSDWAInputMods : InputMods <IntRegSDWAInputModsMatchClass> { let PrintMethod = "printOperandAndIntInputMods"; } @@ -851,10 +872,10 @@ class getVALUDstForVT<ValueType VT> { } // Returns the register class to use for the destination of VOP[12C] -// instructions with GFX9 SDWA extension -class getSDWA9DstForVT<ValueType VT> { +// instructions with SDWA extension +class getSDWADstForVT<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 1), - SDWA9VopcDst, // VOPC + SDWAVopcDst, // VOPC VOPDstOperand<VGPR_32>); // VOP1/2 32-bit dst } @@ -898,8 +919,8 @@ class getVregSrcForVT<ValueType VT> { !if(!eq(VT.Size, 64), VReg_64, VGPR_32)); } -class getSDWA9SrcForVT <ValueType VT> { - RegisterOperand ret = !if(!eq(VT.Size, 16), SDWA9Src16, SDWA9Src32); +class getSDWASrcForVT <ValueType VT> { + RegisterOperand ret = !if(!eq(VT.Size, 16), SDWASrc16, SDWASrc32); } // Returns the register class to use for sources of VOP3 
instructions for the @@ -995,7 +1016,7 @@ class getSrcMod <ValueType VT> { ); } -// Return type of input modifiers operand specified input operand for SDWA/DPP +// Return type of input modifiers operand specified input operand for DPP class getSrcModExt <ValueType VT> { bit isFP = !if(!eq(VT.Value, f16.Value), 1, !if(!eq(VT.Value, f32.Value), 1, @@ -1004,13 +1025,13 @@ class getSrcModExt <ValueType VT> { Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); } -// Return type of input modifiers operand specified input operand for SDWA 9 -class getSrcModSDWA9 <ValueType VT> { +// Return type of input modifiers operand specified input operand for SDWA +class getSrcModSDWA <ValueType VT> { bit isFP = !if(!eq(VT.Value, f16.Value), 1, !if(!eq(VT.Value, f32.Value), 1, !if(!eq(VT.Value, f64.Value), 1, 0))); - Operand ret = !if(isFP, FPRegInputMods, IntRegInputMods); + Operand ret = !if(isFP, FPRegSDWAInputMods, IntRegSDWAInputMods); } // Returns the input arguments for VOP[12C] instructions for the given SrcVT. @@ -1141,36 +1162,12 @@ class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, /* endif */))); } -class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, - bit HasFloatModifiers, Operand Src0Mod, Operand Src1Mod, - ValueType DstVT> { - dag ret = !if(!eq(NumSrcArgs, 0), - // VOP1 without input operands (V_NOP) - (ins), - !if(!eq(NumSrcArgs, 1), - // VOP1_SDWA - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel), - !if(!eq(NumSrcArgs, 2), - !if(!eq(DstVT.Size, 1), - // VOPC_SDWA with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), - // VOP2_SDWA with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel, src1_sel:$src1_sel)), - (ins)/* endif */))); -} -// Ins for GFX9 SDWA -class getInsSDWA9 <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs, - bit HasSDWAOMod, Operand Src0Mod, Operand Src1Mod, - ValueType DstVT> { +// Ins for SDWA +class getInsSDWA <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs, + bit HasSDWAOMod, Operand Src0Mod, Operand Src1Mod, + ValueType DstVT> { dag ret = !if(!eq(NumSrcArgs, 0), // VOP1 without input operands (V_NOP) @@ -1178,31 +1175,31 @@ class getInsSDWA9 <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArg !if(!eq(NumSrcArgs, 1), // VOP1 !if(!eq(HasSDWAOMod, 0), - // VOP1_SDWA9 without omod + // VOP1_SDWA without omod (ins Src0Mod:$src0_modifiers, Src0RC:$src0, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel), - // VOP1_SDWA9 with omod + // VOP1_SDWA with omod (ins Src0Mod:$src0_modifiers, Src0RC:$src0, clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel)), !if(!eq(NumSrcArgs, 2), !if(!eq(DstVT.Size, 1), - // VOPC_SDWA9 + // VOPC_SDWA (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, - src0_sel:$src0_sel, src1_sel:$src1_sel), - // VOP2_SDWA9 + clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), + // VOP2_SDWA !if(!eq(HasSDWAOMod, 0), - // VOP2_SDWA9 without omod + // VOP2_SDWA without omod (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, 
src1_sel:$src1_sel), - // VOP1_SDWA9 with omod + // VOP2_SDWA with omod (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, clampmod:$clamp, omod:$omod, @@ -1220,12 +1217,12 @@ class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCExt> { (outs)); // V_NOP } -// Outs for GFX9 SDWA -class getOutsSDWA9 <bit HasDst, ValueType DstVT, RegisterOperand DstRCSDWA9> { +// Outs for SDWA +class getOutsSDWA <bit HasDst, ValueType DstVT, RegisterOperand DstRCSDWA> { dag ret = !if(HasDst, !if(!eq(DstVT.Size, 1), - (outs DstRCSDWA9:$sdst), - (outs DstRCSDWA9:$vdst)), + (outs DstRCSDWA:$sdst), + (outs DstRCSDWA:$vdst)), (outs)); // V_NOP } @@ -1387,8 +1384,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field ValueType Src2VT = ArgVT[3]; field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret; field RegisterOperand DstRCDPP = getVALUDstForVT<DstVT>.ret; - field RegisterOperand DstRCSDWA = getVALUDstForVT<DstVT>.ret; - field RegisterOperand DstRCSDWA9 = getSDWA9DstForVT<DstVT>.ret; + field RegisterOperand DstRCSDWA = getSDWADstForVT<DstVT>.ret; field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret; field RegisterClass Src1RC32 = getVregSrcForVT<Src1VT>.ret; field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret; @@ -1396,19 +1392,15 @@ class VOPProfile <list<ValueType> _ArgVT> { field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret; field RegisterClass Src0DPP = getVregSrcForVT<Src0VT>.ret; field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret; - field RegisterClass Src0SDWA = getVregSrcForVT<Src0VT>.ret; - field RegisterClass Src1SDWA = getVregSrcForVT<Src1VT>.ret; - field RegisterOperand Src0SDWA9 = getSDWA9SrcForVT<Src0VT>.ret; - field RegisterOperand Src1SDWA9 = getSDWA9SrcForVT<Src0VT>.ret; + field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret; + field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret; field Operand Src0Mod = getSrcMod<Src0VT>.ret; field Operand Src1Mod = getSrcMod<Src1VT>.ret; field Operand Src2Mod = getSrcMod<Src2VT>.ret; field Operand Src0ModDPP = getSrcModExt<Src0VT>.ret; field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret; - field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret; - field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret; - field Operand Src0ModSDWA9 = getSrcModSDWA9<Src0VT>.ret; - field Operand Src1ModSDWA9 = getSrcModSDWA9<Src1VT>.ret; + field Operand Src0ModSDWA = getSrcModSDWA<Src0VT>.ret; + field Operand Src1ModSDWA = getSrcModSDWA<Src1VT>.ret; field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); @@ -1457,8 +1449,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag Outs32 = Outs; field dag Outs64 = Outs; field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret; - field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret; - field dag OutsSDWA9 = getOutsSDWA9<HasDst, DstVT, DstRCSDWA9>.ret; + field dag OutsSDWA = getOutsSDWA<HasDst, DstVT, DstRCSDWA>.ret; field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, @@ -1471,11 +1462,9 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP>.ret; field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, - HasModifiers, Src0ModSDWA, Src1ModSDWA, + HasSDWAOMod, Src0ModSDWA, Src1ModSDWA, DstVT>.ret; - field dag InsSDWA9 = getInsSDWA9<Src0SDWA9, Src1SDWA9, NumSrcArgs, - HasSDWAOMod, Src0ModSDWA9, Src1ModSDWA9, - DstVT>.ret; + field string Asm32 = 
getAsm32<HasDst, NumSrcArgs, DstVT>.ret; field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, HasOMod, DstVT>.ret; @@ -1628,13 +1617,13 @@ def getSDWAOp : InstrMapping { let ValueCols = [["SDWA"]]; } -// Maps ordinary instructions to their SDWA GFX9 counterparts -def getSDWA9Op : InstrMapping { +// Maps SDWA instructions to their ordinary counterparts +def getBasicFromSDWAOp : InstrMapping { let FilterClass = "VOP"; let RowFields = ["OpName"]; let ColFields = ["AsmVariantName"]; - let KeyCol = ["Default"]; - let ValueCols = [["SDWA9"]]; + let KeyCol = ["SDWA"]; + let ValueCols = [["Default"]]; } def getMaskedMIMGOp : InstrMapping { @@ -1669,7 +1658,9 @@ def getMCOpcodeGen : InstrMapping { let ColFields = ["Subtarget"]; let KeyCol = [!cast<string>(SIEncodingFamily.NONE)]; let ValueCols = [[!cast<string>(SIEncodingFamily.SI)], - [!cast<string>(SIEncodingFamily.VI)]]; + [!cast<string>(SIEncodingFamily.VI)], + [!cast<string>(SIEncodingFamily.SDWA)], + [!cast<string>(SIEncodingFamily.SDWA9)]]; } // Get equivalent SOPK instruction. diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 18b197ddb7ae7..3203c38dae344 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -74,7 +74,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkItemIDX(false), WorkItemIDY(false), WorkItemIDZ(false), - PrivateMemoryInputPtr(false) { + ImplicitBufferPtr(false) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); const Function *F = MF.getFunction(); FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F); @@ -86,6 +86,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; ScratchWaveOffsetReg = AMDGPU::SGPR4; FrameOffsetReg = AMDGPU::SGPR5; + StackPtrOffsetReg = AMDGPU::SGPR32; return; } @@ -150,7 +151,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) DispatchID = true; } else if (ST.isMesaGfxShader(MF)) { if (HasStackObjects || MaySpill) - PrivateMemoryInputPtr = true; + ImplicitBufferPtr = true; } // We don't need to worry about accessing spills with flat instructions. @@ -203,11 +204,11 @@ unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) { return FlatScratchInitUserSGPR; } -unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) { - PrivateMemoryPtrUserSGPR = TRI.getMatchingSuperReg( +unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) { + ImplicitBufferPtrUserSGPR = TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); NumUserSGPRs += 2; - return PrivateMemoryPtrUserSGPR; + return ImplicitBufferPtrUserSGPR; } /// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI. diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 9fdb8caac6f21..05aa249584bf1 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -97,7 +97,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { unsigned StackPtrOffsetReg; // Input registers for non-HSA ABI - unsigned PrivateMemoryPtrUserSGPR; + unsigned ImplicitBufferPtrUserSGPR; // Input registers setup for the HSA ABI. // User SGPRs in allocation order. 
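The renamed query/allocate pair above is meant to be used together during calling-convention lowering. A minimal sketch of the expected call sequence for a Mesa GFX shader follows; this is illustrative only — the matching SIISelLowering.cpp hunk is not shown in this excerpt, and the variable names (Info, MF, CCInfo, TRI) are assumptions:

    // Sketch: reserve the implicit buffer pointer in two user SGPRs and make
    // it live-in, mirroring the old PrivateMemoryPtr path.
    if (Info->hasImplicitBufferPtr()) {
      unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(TRI);
      MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SReg_64RegClass);
      CCInfo.AllocateReg(ImplicitBufferPtrReg);
    }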
@@ -179,7 +179,7 @@ private: // Private memory buffer // Compute directly in sgpr[0:1] // Other shaders indirect 64-bits at sgpr[0:1] - bool PrivateMemoryInputPtr : 1; + bool ImplicitBufferPtr : 1; MCPhysReg getNextUserSGPR() const { assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); @@ -236,7 +236,7 @@ public: unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); unsigned addDispatchID(const SIRegisterInfo &TRI); unsigned addFlatScratchInit(const SIRegisterInfo &TRI); - unsigned addPrivateMemoryPtr(const SIRegisterInfo &TRI); + unsigned addImplicitBufferPtr(const SIRegisterInfo &TRI); // Add system SGPRs. unsigned addWorkGroupIDX() { @@ -341,8 +341,8 @@ public: return WorkItemIDZ; } - bool hasPrivateMemoryInputPtr() const { - return PrivateMemoryInputPtr; + bool hasImplicitBufferPtr() const { + return ImplicitBufferPtr; } unsigned getNumUserSGPRs() const { @@ -396,8 +396,8 @@ public: return QueuePtrUserSGPR; } - unsigned getPrivateMemoryPtrUserSGPR() const { - return PrivateMemoryPtrUserSGPR; + unsigned getImplicitBufferPtrUserSGPR() const { + return ImplicitBufferPtrUserSGPR; } bool hasSpilledSGPRs() const { diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index f4ddf1891683b..4ac23ef03cb32 100644 --- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -67,9 +67,9 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; void matchSDWAOperands(MachineFunction &MF); - bool isConvertibleToSDWA(const MachineInstr &MI) const; + bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const; bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); - void legalizeScalarOperands(MachineInstr &MI) const; + void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const; StringRef getPassName() const override { return "SI Peephole SDWA"; } @@ -224,7 +224,7 @@ static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) { static bool isSubregOf(const MachineOperand &SubReg, const MachineOperand &SuperReg, const TargetRegisterInfo *TRI) { - + if (!SuperReg.isReg() || !SubReg.isReg()) return false; @@ -557,7 +557,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) { MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - + if (TRI->isPhysicalRegister(Src0->getReg()) || TRI->isPhysicalRegister(Dst->getReg())) break; @@ -590,7 +590,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) { break; MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - + if (TRI->isPhysicalRegister(Src1->getReg()) || TRI->isPhysicalRegister(Dst->getReg())) break; @@ -607,16 +607,38 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) { } } -bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const { +bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI, + const SISubtarget &ST) const { // Check if this instruction has opcode that supports SDWA - unsigned Opc = MI.getOpcode(); - if (AMDGPU::getSDWAOp(Opc) != -1) - return true; - int Opc32 = AMDGPU::getVOPe32(Opc); - if (Opc32 != -1 && AMDGPU::getSDWAOp(Opc32) != -1) - return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) && - !TII->getNamedOperand(MI, AMDGPU::OpName::sdst); - return false; + int Opc = MI.getOpcode(); + if (AMDGPU::getSDWAOp(Opc) == -1) + Opc = AMDGPU::getVOPe32(Opc); + + if (Opc == -1 || AMDGPU::getSDWAOp(Opc) == -1) + 
    return false;
+
+  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+    return false;
+
+  if (TII->isVOPC(Opc)) {
+    if (!ST.hasSDWASdst()) {
+      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+      if (SDst && SDst->getReg() != AMDGPU::VCC)
+        return false;
+    }
+
+    if (!ST.hasSDWAClampVOPC() && TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
+      return false;
+
+  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
+    return false;
+  }
+
+  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
+                           Opc == AMDGPU::V_MAC_F32_e32))
+    return false;
+
+  return true;
 }
 
 bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
@@ -641,6 +663,11 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
   if (Dst) {
     assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
     SDWAInst.add(*Dst);
+  } else {
+    Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+    assert(Dst &&
+           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
+    SDWAInst.add(*Dst);
   }
 
   // Copy src0, initialize src0_modifiers. All SDWA instructions have src0 and
@@ -677,9 +704,23 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
     SDWAInst.add(*Src2);
   }
 
-  // Initialize clamp.
+  // Copy clamp if present, initialize otherwise
   assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
-  SDWAInst.addImm(0);
+  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
+  if (Clamp) {
+    SDWAInst.add(*Clamp);
+  } else {
+    SDWAInst.addImm(0);
+  }
+
+  // Copy omod if present, initialize otherwise if needed
+  MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
+  if (OMod) {
+    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1);
+    SDWAInst.add(*OMod);
+  } else if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
+    SDWAInst.addImm(0);
+  }
 
   // Initialize dst_sel and dst_unused if present
   if (Dst) {
@@ -733,16 +774,25 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
 }
 
 // If an instruction was converted to SDWA it should not have immediates or SGPR
-// operands. Copy its scalar operands into VGPRs.
+// operands (one SGPR operand is allowed on GFX9). Copy its scalar operands into VGPRs.
+void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const { const MCInstrDesc &Desc = TII->get(MI.getOpcode()); - for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) { - MachineOperand &Op = MI.getOperand(I); + unsigned ConstantBusCount = 0; + for (MachineOperand &Op: MI.explicit_uses()) { if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg()))) continue; + + unsigned I = MI.getOperandNo(&Op); if (Desc.OpInfo[I].RegClass == -1 || !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass))) continue; + + if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && + TRI->isSGPRReg(*MRI, Op.getReg())) { + ++ConstantBusCount; + continue; + } + unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), VGPR); @@ -758,22 +808,20 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI) const { bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - if (!ST.hasSDWA() || - !AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9 + if (!ST.hasSDWA()) return false; - } MRI = &MF.getRegInfo(); TRI = ST.getRegisterInfo(); TII = ST.getInstrInfo(); - + // Find all SDWA operands in MF. matchSDWAOperands(MF); for (const auto &OperandPair : SDWAOperands) { const auto &Operand = OperandPair.second; MachineInstr *PotentialMI = Operand->potentialToConvert(TII); - if (PotentialMI && isConvertibleToSDWA(*PotentialMI)) { + if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { PotentialMatches[PotentialMI].push_back(Operand.get()); } } @@ -788,7 +836,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { bool Ret = !ConvertedInstructions.empty(); while (!ConvertedInstructions.empty()) - legalizeScalarOperands(*ConvertedInstructions.pop_back_val()); + legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST); return Ret; } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index b611f28fcabdf..ef6ad4ad0c8f3 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1044,18 +1044,29 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, unsigned CarryOut = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); unsigned ScaledReg - = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - // XXX - Should this use a vector shift? - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) - .addReg(DiffReg, RegState::Kill) - .addImm(Log2_32(ST.getWavefrontSize())); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg) + .addImm(Log2_32(ST.getWavefrontSize())) + .addReg(DiffReg, RegState::Kill); // TODO: Fold if use instruction is another add of a constant. 
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg) - .addReg(CarryOut, RegState::Define | RegState::Dead) - .addImm(Offset) - .addReg(ScaledReg, RegState::Kill); + if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg) + .addReg(CarryOut, RegState::Define | RegState::Dead) + .addImm(Offset) + .addReg(ScaledReg, RegState::Kill); + } else { + unsigned ConstOffsetReg + = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) + .addImm(Offset); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg) + .addReg(CarryOut, RegState::Define | RegState::Dead) + .addReg(ConstOffsetReg, RegState::Kill) + .addReg(ScaledReg, RegState::Kill); + } MRI.setRegAllocationHint(CarryOut, 0, AMDGPU::VCC); } @@ -1341,12 +1352,11 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: - if (ST.isAmdCodeObjectV2(MF)) { - assert(MFI->hasPrivateSegmentBuffer()); - return MFI->PrivateSegmentBufferUserSGPR; - } - assert(MFI->hasPrivateMemoryInputPtr()); - return MFI->PrivateMemoryPtrUserSGPR; + assert(MFI->hasPrivateSegmentBuffer()); + return MFI->PrivateSegmentBufferUserSGPR; + case SIRegisterInfo::IMPLICIT_BUFFER_PTR: + assert(MFI->hasImplicitBufferPtr()); + return MFI->ImplicitBufferPtrUserSGPR; case SIRegisterInfo::KERNARG_SEGMENT_PTR: assert(MFI->hasKernargSegmentPtr()); return MFI->KernargSegmentPtrUserSGPR; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 8fed6d5f9710f..600cc886cb595 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -197,12 +197,13 @@ public: WORKGROUP_ID_Y = 11, WORKGROUP_ID_Z = 12, PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, + IMPLICIT_BUFFER_PTR = 15, // VGPRS: - FIRST_VGPR_VALUE = 15, + FIRST_VGPR_VALUE = 16, WORKITEM_ID_X = FIRST_VGPR_VALUE, - WORKITEM_ID_Y = 16, - WORKITEM_ID_Z = 17 + WORKITEM_ID_Y = 17, + WORKITEM_ID_Z = 18 }; /// \brief Returns the physical register that \p Value is stored in. diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index c5f121757e623..96a18544f02ac 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -92,6 +92,8 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, case AMDGPU::V_ADDC_U32_e64: case AMDGPU::V_SUBB_U32_e64: + if (TII->getNamedOperand(MI, AMDGPU::OpName::src1)->isImm()) + return false; // Additional verification is needed for sdst/src2. 
return true; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index f581e69980c79..26515b27bb77d 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -538,6 +538,27 @@ bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { Reg == AMDGPU::SCC; } +bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { + + if (Reg0 == Reg1) { + return true; + } + + unsigned SubReg0 = TRI->getSubReg(Reg0, 1); + if (SubReg0 == 0) { + return TRI->getSubRegIndex(Reg1, Reg0) > 0; + } + + for (unsigned Idx = 2; SubReg0 > 0; ++Idx) { + if (isRegIntersect(Reg1, SubReg0, TRI)) { + return true; + } + SubReg0 = TRI->getSubReg(Reg0, Idx); + } + + return false; +} + unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { switch(Reg) { diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index eff0230d21f57..936e4921a7097 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -271,6 +271,9 @@ bool isGFX9(const MCSubtargetInfo &STI); /// \brief Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); +/// \brief Is there any intersection between registers +bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI); + /// If \p Reg is a pseudo reg, return the correct hardware register given /// \p STI otherwise return \p Reg. unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI); diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index 95b5ef0a49dba..96b33c373f052 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -93,11 +93,6 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : let AsmMatchConverter = "cvtSdwaVOP1"; } -class VOP1_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : - VOP_SDWA9_Pseudo <OpName, P, pattern> { - let AsmMatchConverter = "cvtSdwaVOP1"; -} - class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { list<dag> ret = !if(P.HasModifiers, @@ -117,7 +112,6 @@ multiclass VOP1Inst <string opName, VOPProfile P, def _e32 : VOP1_Pseudo <opName, P>; def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>; def _sdwa : VOP1_SDWA_Pseudo <opName, P>; - def _sdwa9 : VOP1_SDWA9_Pseudo <opName, P>; } // Special profile for instructions which have clamp @@ -274,12 +268,10 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0); let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel); - let InsSDWA9 = (ins Src0RC32:$vdst, Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, - clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel); let Asm32 = getAsm32<1, 1>.ret; let Asm64 = getAsm64<1, 1, 0, 1>.ret; @@ -545,8 +537,8 @@ multiclass VOP1_Real_vi <bits<10> op> { VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; def _sdwa_gfx9 : - VOP_SDWA9_Real <!cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9")>, - VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>; + VOP_SDWA9_Real 
<!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; // For now left dpp only for asm/dasm // TODO: add corresponding pseudo diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 657cacaa792ca..7b9bc71ad4c77 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -114,11 +114,6 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : let AsmMatchConverter = "cvtSdwaVOP2"; } -class VOP2_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : - VOP_SDWA9_Pseudo <OpName, P, pattern> { - let AsmMatchConverter = "cvtSdwaVOP2"; -} - class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { list<dag> ret = !if(P.HasModifiers, [(set P.DstVT:$vdst, @@ -139,7 +134,6 @@ multiclass VOP2Inst <string opName, Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; def _sdwa : VOP2_SDWA_Pseudo <opName, P>; - def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P>; } multiclass VOP2bInst <string opName, @@ -156,10 +150,6 @@ multiclass VOP2bInst <string opName, def _sdwa : VOP2_SDWA_Pseudo <opName, P> { let AsmMatchConverter = "cvtSdwaVOP2b"; } - - def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P> { - let AsmMatchConverter = "cvtSdwaVOP2b"; - } } def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, @@ -221,17 +211,13 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { VGPR_32:$src2, // stub argument dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, VGPR_32:$src2, // stub argument - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); - let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, - Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, - VGPR_32:$src2, // stub argument - clampmod:$clamp, omod:$omod, - dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel, src1_sel:$src1_sel); let Asm32 = getAsm32<1, 2, vt>.ret; let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret; let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret; @@ -289,15 +275,10 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); - let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, - Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, - clampmod:$clamp, omod:$omod, - dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel, src1_sel:$src1_sel); - let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0, Src1Mod:$src1_modifiers, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, @@ -326,6 +307,8 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> { let Ins64 = Ins32; let Asm32 = " $vdst, $src0, $src1"; let Asm64 = Asm32; + let HasExt = 0; + let HasSDWA9 = 0; } def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> { @@ -335,6 +318,8 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> { let Ins64 = Ins32; let Asm32 = " $vdst, $src0, $src1"; let Asm64 = Asm32; + let HasExt = 0; + let HasSDWA9 = 0; } 
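Setting these two flags on VOP_READLANE/VOP_WRITELANE above is the per-profile opt-out from the extended encodings; the VOP_NO_EXT wrapper applied to the VOP2 instructions below is shorthand for the same idea. A sketch of the idiom (hypothetical def, not part of this patch):

    // TableGen sketch: a profile that must produce neither the VI SDWA/DPP
    // variants nor the GFX9 SDWA real encoding.
    def VOP_EXAMPLE_NO_EXT : VOPProfile<[i32, i32, i32, untyped]> {
      let HasExt = 0;    // no _sdwa/_dpp pseudo or VI real encodings
      let HasSDWA9 = 0;  // no _sdwa_gfx9 real encoding
    }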
//===----------------------------------------------------------------------===// @@ -397,20 +382,29 @@ def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">; } // End isConvergent = 1 -defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>; -defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32>; -defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_lo>; -defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_hi>; -defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>; -defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_I32_F32_I32>; // TODO: set "Uses = dst" -defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_I32_F32_F32>; -defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_I32_F32_F32>; -defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, AMDGPUpkrtz_f16_f32>; -defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_I32_I32_I32>; -defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_I32_I32_I32>; +defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>; +defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>>; +defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_lo>; +defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>; +defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>; +defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst" +defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>; +defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>; +defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>; +defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>>; +defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>>; } // End SubtargetPredicate = isGCN +def : Pat< + (AMDGPUadde i32:$src0, i32:$src1, i1:$src2), + (V_ADDC_U32_e64 $src0, $src1, $src2) +>; + +def : Pat< + (AMDGPUsube i32:$src0, i32:$src1, i1:$src2), + (V_SUBB_U32_e64 $src0, $src1, $src2) +>; // These instructions only exist on SI and CI let SubtargetPredicate = isSICI in { @@ -728,8 +722,8 @@ multiclass VOP2_SDWA_Real <bits<6> op> { multiclass VOP2_SDWA9_Real <bits<6> op> { def _sdwa_gfx9 : - VOP_SDWA9_Real <!cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9")>, - VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>; + VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; } multiclass VOP2be_Real_e32e64_vi <bits<6> op> : diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td index cd347b86d3050..f3482a22d5dcd 100644 --- a/lib/Target/AMDGPU/VOPCInstructions.td +++ b/lib/Target/AMDGPU/VOPCInstructions.td @@ -113,11 +113,6 @@ class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : let AsmMatchConverter = "cvtSdwaVOPC"; } -class VOPC_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : - VOP_SDWA9_Pseudo <OpName, P, pattern> { - let AsmMatchConverter = "cvtSdwaVOPC"; -} - // This class is used only with VOPC 
instructions. Use $sdst for out operand class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> : InstAlias <ps.OpName#" "#p.Asm32, (inst)>, PredicateControl { @@ -189,13 +184,6 @@ multiclass VOPC_Pseudos <string opName, let isConvergent = DefExec; let isCompare = 1; } - - def _sdwa9 : VOPC_SDWA9_Pseudo <opName, P> { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let SchedRW = P.Schedule; - let isConvergent = DefExec; - let isCompare = 1; - } } def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>; @@ -540,14 +528,12 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> : VOPC_Profile<sched, vt, i32> { let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); let Asm64 = "$sdst, $src0_modifiers, $src1"; + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel); - let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, - Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, - src0_sel:$src0_sel, src1_sel:$src1_sel); + let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel"; - //let AsmSDWA9 = " $sdst, $src0_modifiers, $src1_modifiers $src0_sel $src1_sel"; let HasSrc1Mods = 0; let HasClamp = 0; let HasOMod = 0; @@ -580,12 +566,6 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec> { let SchedRW = p.Schedule; let isConvergent = DefExec; } - - def _sdwa9 : VOPC_SDWA9_Pseudo <opName, p> { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let SchedRW = p.Schedule; - let isConvergent = DefExec; - } } def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>; @@ -954,8 +934,8 @@ multiclass VOPC_Real_vi <bits<10> op> { VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; def _sdwa_gfx9 : - VOP_SDWA9_Real <!cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9")>, - VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>; + VOP_SDWA9_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>, + VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"), !cast<Instruction>(NAME#"_e32_vi")> { diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td index 4da654f84f9d1..e386f21c2ba49 100644 --- a/lib/Target/AMDGPU/VOPInstructions.td +++ b/lib/Target/AMDGPU/VOPInstructions.td @@ -232,11 +232,11 @@ class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 { let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1 let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2 - let Inst{11} = !if(P.HasOpSel, src0_modifiers{2}, 0); // op_sel(0) - let Inst{12} = !if(P.HasOpSel, src1_modifiers{2}, 0); // op_sel(1) - let Inst{13} = !if(P.HasOpSel, src2_modifiers{2}, 0); // op_sel(2) + let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0) + let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1) + let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2) - let Inst{14} = !if(P.HasOpSel, src2_modifiers{3}, 0); // op_sel_hi(2) + let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, 0); // op_sel_hi(2) let Inst{15} = !if(P.HasClamp, clamp{0}, 0); @@ -245,8 +245,8 @@ class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 { let Inst{40-32} = !if(P.HasSrc0, src0, 0); let Inst{49-41} = !if(P.HasSrc1, src1, 0); let Inst{58-50} = !if(P.HasSrc2, src2, 0); - let Inst{59} = !if(P.HasOpSel, src0_modifiers{3}, 0); // 
op_sel_hi(0) - let Inst{60} = !if(P.HasOpSel, src1_modifiers{3}, 0); // op_sel_hi(1) + let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, 0); // op_sel_hi(0) + let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, 0); // op_sel_hi(1) let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo) let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo) let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo) @@ -300,6 +300,19 @@ class VOP_SDWAe<VOPProfile P> : Enc64 { let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0); } +// GFX9 adds two features to SDWA: +// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD. +// a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather +// than VGPRs (at most 1 can be an SGPR); +// b. OMOD is the standard output modifier (result *2, *4, /2) +// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This +// replaces OMOD and the dest fields with SD and SDST (SGPR destination) +// field. +// a. When SD=1, the SDST is used as the destination for the compare result; +// b. When SD=0, VCC is used. +// +// In GFX9, V_MAC_F16, V_MAC_F32 opcodes cannot be used with SDWA + // gfx9 SDWA basic encoding class VOP_SDWA9e<VOPProfile P> : Enc64 { bits<9> src0; // {src0_sgpr{0}, src0{7-0}} @@ -353,6 +366,7 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : string Mnemonic = opName; string AsmOperands = P.AsmSDWA; + string AsmOperands9 = P.AsmSDWA9; let Size = 8; let mayLoad = 0; @@ -372,53 +386,9 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : VOPProfile Pfl = P; } -// GFX9 adds two features to SDWA: -// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD. -// a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather -// than VGPRs (at most 1 can be an SGPR); -// b. OMOD is the standard output modifier (result *2, *4, /2) -// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This -// replaces OMOD and the dest fields with SD and SDST (SGPR destination) -// field. -// a. When SD=1, the SDST is used as the destination for the compare result; -// b.when SD=0, VCC is used. 
-// -// In GFX9, V_MAC_F16, V_MAC_F32 opcodes cannot be used with SDWA - -class VOP_SDWA9_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : - InstSI <P.OutsSDWA9, P.InsSDWA9, "", pattern>, - VOP <opName>, - SIMCInstr <opName#"_sdwa9", SIEncodingFamily.NONE>, - MnemonicAlias <opName#"_sdwa9", opName> { - - let isPseudo = 1; - let isCodeGenOnly = 1; - let UseNamedOperandTable = 1; - - string Mnemonic = opName; - string AsmOperands = P.AsmSDWA9; - - let Size = 8; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - - let VALU = 1; - let SDWA = 1; - let Uses = [EXEC]; - - let SubtargetPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst); - let AssemblerPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst); - let AsmVariantName = !if(P.HasSDWA9, AMDGPUAsmVariants.SDWA9, - AMDGPUAsmVariants.Disable); - let DecoderNamespace = "SDWA9"; - - VOPProfile Pfl = P; -} - class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> : InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, - SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> { + SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA> { let isPseudo = 0; let isCodeGenOnly = 0; @@ -443,9 +413,9 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> : let TSFlags = ps.TSFlags; } -class VOP_SDWA9_Real <VOP_SDWA9_Pseudo ps> : - InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, - SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> { +class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> : + InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands9, []>, + SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA9> { let isPseudo = 0; let isCodeGenOnly = 0; @@ -458,13 +428,15 @@ class VOP_SDWA9_Real <VOP_SDWA9_Pseudo ps> : let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; + let SubtargetPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst); + let AssemblerPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst); + let AsmVariantName = !if(ps.Pfl.HasSDWA9, AMDGPUAsmVariants.SDWA9, + AMDGPUAsmVariants.Disable); + let DecoderNamespace = "SDWA9"; + // Copy relevant pseudo op flags - let SubtargetPredicate = ps.SubtargetPredicate; - let AssemblerPredicate = ps.AssemblerPredicate; let AsmMatchConverter = ps.AsmMatchConverter; - let AsmVariantName = ps.AsmVariantName; let UseNamedOperandTable = ps.UseNamedOperandTable; - let DecoderNamespace = ps.DecoderNamespace; let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index ca68f5d42c32c..6f67183df6a18 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -100,7 +100,8 @@ def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true", "Enable Reliability, Availability and Serviceability extensions">; def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true", "Enable fast computation of positive address offsets">; - +def FeatureFuseAES : SubtargetFeature<"fuse-aes", "HasFuseAES", "true", + "CPU fuses AES crypto operations">; // Cyclone has preferred instructions for zeroing VFP registers, which can // execute in 0 cycles. 
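FeatureFuseAES only defines the subtarget bit; the consumer is the shouldScheduleAdjacent() hook in ARMMacroFusion.cpp further down, and per the note in ARMMacroFusion.h it still has to be wired into ARMPassConfig::createMachineScheduler() before it affects scheduling. The attribute string in the def above is what -mattr keys on, so once wired up the fusion could be forced like this (illustrative invocation; triple and file name are placeholders):

    llc -mtriple=armv8-linux-gnueabihf -mattr=+crypto,+fuse-aes aes.ll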
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index f9da036c7e468..90f635c812542 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -1504,6 +1504,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     return;
   }
   case ARM::CONSTPOOL_ENTRY: {
+    if (Subtarget->genExecuteOnly())
+      llvm_unreachable("execute-only should not generate constant pools");
+
     /// CONSTPOOL_ENTRY - This instruction represents a floating constant pool
     /// in the function. The first operand is the ID# for this instruction, the
     /// second is the index into the MachineConstantPool that this is, the third
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 8715657ad5e25..e0810c358f2da 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -665,12 +665,14 @@ bool ARMBaseInstrInfo::isPredicable(const MachineInstr &MI) const {
   const ARMFunctionInfo *AFI =
       MI.getParent()->getParent()->getInfo<ARMFunctionInfo>();
 
+  // Neon instructions in Thumb2 IT blocks are deprecated, see ARMARM.
+  // In their ARM encoding, they can't be encoded in a conditional form.
+  if ((MI.getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON)
+    return false;
+
   if (AFI->isThumb2Function()) {
     if (getSubtarget().restrictIT())
       return isV8EligibleForIT(&MI);
-  } else { // non-Thumb
-    if ((MI.getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON)
-      return false;
   }
 
   return true;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 5b2d093e8f0da..2bcc707e9fc3c 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -2669,12 +2669,35 @@ static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
 // be used to form addressing mode. These wrapped nodes will be selected
 // into MOVi.
-static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
+SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
+                                             SelectionDAG &DAG) const {
   EVT PtrVT = Op.getValueType();
   // FIXME there is no actual debug info here
   SDLoc dl(Op);
   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   SDValue Res;
+
+  // When generating execute-only code, constant pools must be promoted to the
+  // global data section. It's a bit ugly that we can't share them across basic
+  // blocks, but this way we guarantee that execute-only behaves correctly with
+  // position-independent addressing modes.
+  if (Subtarget->genExecuteOnly()) {
+    auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
+    auto T = const_cast<Type*>(CP->getType());
+    auto C = const_cast<Constant*>(CP->getConstVal());
+    auto M = const_cast<Module*>(DAG.getMachineFunction().
+ getFunction()->getParent()); + auto GV = new GlobalVariable( + *M, T, /*isConst=*/true, GlobalVariable::InternalLinkage, C, + Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + + Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" + + Twine(AFI->createPICLabelUId()) + ); + SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV), + dl, PtrVT); + return LowerGlobalAddress(GA, DAG); + } + if (CP->isMachineConstantPoolEntry()) Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlignment()); @@ -3118,6 +3141,19 @@ static bool isReadOnly(const GlobalValue *GV) { isa<Function>(GV); } +SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op, + SelectionDAG &DAG) const { + switch (Subtarget->getTargetTriple().getObjectFormat()) { + default: llvm_unreachable("unknown object format"); + case Triple::COFF: + return LowerGlobalAddressWindows(Op, DAG); + case Triple::ELF: + return LowerGlobalAddressELF(Op, DAG); + case Triple::MachO: + return LowerGlobalAddressDarwin(Op, DAG); + } +} + SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -7634,21 +7670,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Don't know how to custom lower this!"); case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); - case ISD::ConstantPool: - if (Subtarget->genExecuteOnly()) - llvm_unreachable("execute-only should not generate constant pools"); - return LowerConstantPool(Op, DAG); + case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); - case ISD::GlobalAddress: - switch (Subtarget->getTargetTriple().getObjectFormat()) { - default: llvm_unreachable("unknown object format"); - case Triple::COFF: - return LowerGlobalAddressWindows(Op, DAG); - case Triple::ELF: - return LowerGlobalAddressELF(Op, DAG); - case Triple::MachO: - return LowerGlobalAddressDarwin(Op, DAG); - } + case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 26da528c19e6d..5044134f5b1e2 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -601,6 +601,8 @@ class InstrItineraryData; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 45471a4e95b39..53db5acbe805c 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -4756,6 +4756,16 @@ def t2MOVsr: t2AsmPseudo<"mov${p} $Rd, $shift", def t2MOVSsr: t2AsmPseudo<"movs${p} $Rd, $shift", (ins rGPR:$Rd, so_reg_reg:$shift, pred:$p)>; +// Aliases for the above with the .w qualifier +def : t2InstAlias<"mov${p}.w $Rd, $shift", + (t2MOVsi rGPR:$Rd, t2_so_reg:$shift, pred:$p)>; 
+def : t2InstAlias<"movs${p}.w $Rd, $shift", + (t2MOVSsi rGPR:$Rd, t2_so_reg:$shift, pred:$p)>; +def : t2InstAlias<"mov${p}.w $Rd, $shift", + (t2MOVsr rGPR:$Rd, so_reg_reg:$shift, pred:$p)>; +def : t2InstAlias<"movs${p}.w $Rd, $shift", + (t2MOVSsr rGPR:$Rd, so_reg_reg:$shift, pred:$p)>; + // ADR w/o the .w suffix def : t2InstAlias<"adr${p} $Rd, $addr", (t2ADR rGPR:$Rd, t2adrlabel:$addr, pred:$p)>; diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 2ae3bad4076b0..4cb0eca5ee5f8 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -42,6 +42,10 @@ public: private: bool selectImpl(MachineInstr &I) const; + bool selectICmp(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII, + MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) const; + const ARMBaseInstrInfo &TII; const ARMBaseRegisterInfo &TRI; const ARMBaseTargetMachine &TM; @@ -243,6 +247,105 @@ static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank, return Opc; } +static ARMCC::CondCodes getComparePred(CmpInst::Predicate Pred) { + switch (Pred) { + // Needs two compares... + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_UEQ: + default: + // AL is our "false" for now. The other two need more compares. + return ARMCC::AL; + case CmpInst::ICMP_EQ: + case CmpInst::FCMP_OEQ: + return ARMCC::EQ; + case CmpInst::ICMP_SGT: + case CmpInst::FCMP_OGT: + return ARMCC::GT; + case CmpInst::ICMP_SGE: + case CmpInst::FCMP_OGE: + return ARMCC::GE; + case CmpInst::ICMP_UGT: + case CmpInst::FCMP_UGT: + return ARMCC::HI; + case CmpInst::FCMP_OLT: + return ARMCC::MI; + case CmpInst::ICMP_ULE: + case CmpInst::FCMP_OLE: + return ARMCC::LS; + case CmpInst::FCMP_ORD: + return ARMCC::VC; + case CmpInst::FCMP_UNO: + return ARMCC::VS; + case CmpInst::FCMP_UGE: + return ARMCC::PL; + case CmpInst::ICMP_SLT: + case CmpInst::FCMP_ULT: + return ARMCC::LT; + case CmpInst::ICMP_SLE: + case CmpInst::FCMP_ULE: + return ARMCC::LE; + case CmpInst::FCMP_UNE: + case CmpInst::ICMP_NE: + return ARMCC::NE; + case CmpInst::ICMP_UGE: + return ARMCC::HS; + case CmpInst::ICMP_ULT: + return ARMCC::LO; + } +} + +bool ARMInstructionSelector::selectICmp(MachineInstrBuilder &MIB, + const ARMBaseInstrInfo &TII, + MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) const { + auto &MBB = *MIB->getParent(); + auto InsertBefore = std::next(MIB->getIterator()); + auto &DebugLoc = MIB->getDebugLoc(); + + // Move 0 into the result register. + auto Mov0I = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::MOVi)) + .addDef(MRI.createVirtualRegister(&ARM::GPRRegClass)) + .addImm(0) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + if (!constrainSelectedInstRegOperands(*Mov0I, TII, TRI, RBI)) + return false; + + // Perform the comparison. + auto LHSReg = MIB->getOperand(2).getReg(); + auto RHSReg = MIB->getOperand(3).getReg(); + assert(MRI.getType(LHSReg) == MRI.getType(RHSReg) && + MRI.getType(LHSReg).getSizeInBits() == 32 && + MRI.getType(RHSReg).getSizeInBits() == 32 && + "Unsupported types for comparison operation"); + auto CmpI = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::CMPrr)) + .addUse(LHSReg) + .addUse(RHSReg) + .add(predOps(ARMCC::AL)); + if (!constrainSelectedInstRegOperands(*CmpI, TII, TRI, RBI)) + return false; + + // Move 1 into the result register if the flags say so. 
+ auto ResReg = MIB->getOperand(0).getReg(); + auto Cond = + static_cast<CmpInst::Predicate>(MIB->getOperand(1).getPredicate()); + auto ARMCond = getComparePred(Cond); + if (ARMCond == ARMCC::AL) + return false; + + auto Mov1I = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::MOVCCi)) + .addDef(ResReg) + .addUse(Mov0I->getOperand(0).getReg()) + .addImm(1) + .add(predOps(ARMCond, ARM::CPSR)); + if (!constrainSelectedInstRegOperands(*Mov1I, TII, TRI, RBI)) + return false; + + MIB->eraseFromParent(); + return true; +} + bool ARMInstructionSelector::select(MachineInstr &I) const { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -343,6 +446,8 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { I.setDesc(TII.get(COPY)); return selectCopy(I, TII, MRI, TRI, RBI); } + case G_ICMP: + return selectICmp(MIB, TII, MRI, TRI, RBI); case G_GEP: I.setDesc(TII.get(ARM::ADDrr)); MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index a706079d98662..5873c7fb38729 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -86,6 +86,12 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({G_CONSTANT, s32}, Legal); + setAction({G_ICMP, s1}, Legal); + for (auto Ty : {s8, s16}) + setAction({G_ICMP, 1, Ty}, WidenScalar); + for (auto Ty : {s32, p0}) + setAction({G_ICMP, 1, Ty}, Legal); + if (!ST.useSoftFloat() && ST.hasVFP2()) { setAction({G_FADD, s32}, Legal); setAction({G_FADD, s64}, Legal); diff --git a/lib/Target/ARM/ARMMacroFusion.cpp b/lib/Target/ARM/ARMMacroFusion.cpp new file mode 100644 index 0000000000000..1b6e97c28d453 --- /dev/null +++ b/lib/Target/ARM/ARMMacroFusion.cpp @@ -0,0 +1,57 @@ +//===- ARMMacroFusion.cpp - ARM Macro Fusion ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the ARM implementation of the DAG scheduling +/// mutation to pair instructions back to back. +// +//===----------------------------------------------------------------------===// + +#include "ARMMacroFusion.h" +#include "ARMSubtarget.h" +#include "llvm/CodeGen/MacroFusion.h" +#include "llvm/Target/TargetInstrInfo.h" + +namespace llvm { + +/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused +/// together. Given SecondMI, when FirstMI is unspecified, then check if +/// SecondMI may be part of a fused pair at all. +static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, + const TargetSubtargetInfo &TSI, + const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { + const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(TSI); + + // Assume wildcards for unspecified instrs. + unsigned FirstOpcode = + FirstMI ? FirstMI->getOpcode() + : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END); + unsigned SecondOpcode = SecondMI.getOpcode(); + + if (ST.hasFuseAES()) + // Fuse AES crypto operations. + switch(SecondOpcode) { + // AES encode. + case ARM::AESMC : + return FirstOpcode == ARM::AESE || + FirstOpcode == ARM::INSTRUCTION_LIST_END; + // AES decode. 
+ case ARM::AESIMC: + return FirstOpcode == ARM::AESD || + FirstOpcode == ARM::INSTRUCTION_LIST_END; + } + + return false; +} + +std::unique_ptr<ScheduleDAGMutation> createARMMacroFusionDAGMutation () { + return createMacroFusionDAGMutation(shouldScheduleAdjacent); +} + +} // end namespace llvm diff --git a/lib/Target/ARM/ARMMacroFusion.h b/lib/Target/ARM/ARMMacroFusion.h new file mode 100644 index 0000000000000..1e4fc6687eae8 --- /dev/null +++ b/lib/Target/ARM/ARMMacroFusion.h @@ -0,0 +1,24 @@ +//===- ARMMacroFusion.h - ARM Macro Fusion ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the ARM definition of the DAG scheduling mutation +/// to pair instructions back to back. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineScheduler.h" + +namespace llvm { + +/// Note that you have to add: +/// DAG.addMutation(createARMMacroFusionDAGMutation()); +/// to ARMPassConfig::createMachineScheduler() to have an effect. +std::unique_ptr<ScheduleDAGMutation> createARMMacroFusionDAGMutation(); + +} // llvm diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index f59b075e6dd9a..2350d0c6ef69e 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -255,6 +255,16 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OperandsMapping = getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr}); break; + case G_ICMP: { + LLT Ty2 = MRI.getType(MI.getOperand(2).getReg()); + (void)Ty2; + assert(Ty2.getSizeInBits() == 32 && "Unsupported size for G_ICMP"); + OperandsMapping = + getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr, + &ARM::ValueMappings[ARM::GPR3OpsIdx], + &ARM::ValueMappings[ARM::GPR3OpsIdx]}); + break; + } case G_MERGE_VALUES: { // We only support G_MERGE_VALUES for creating a double precision floating // point value out of two GPRs. diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index af682dd8321cf..d890d0fa777e8 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -285,6 +285,10 @@ protected: /// HasFPAO - if true, processor does positive address offset computation faster bool HasFPAO = false; + /// HasFuseAES - if true, processor executes back to back AES instruction + /// pairs faster. + bool HasFuseAES = false; + /// If true, if conversion may decide to leave some instructions unpredicated. bool IsProfitableToUnpredicate = false; @@ -561,6 +565,10 @@ public: bool hasD16() const { return HasD16; } bool hasFullFP16() const { return HasFullFP16; } + bool hasFuseAES() const { return HasFuseAES; } + /// \brief Return true if the CPU supports any kind of instruction fusion. 
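ARMMacroFusion's shouldScheduleAdjacent, shown above, answers two questions with one predicate: given a concrete pair, should the two instructions be glued back to back, and given only the second instruction (first unspecified), could it be part of any fused pair at all. A standalone model of that wildcard convention, with hypothetical opcode tags standing in for the real ones:

#include <cstdio>

// Hypothetical opcode tags; WILDCARD plays the role of
// INSTRUCTION_LIST_END when the first instruction is unspecified.
enum Opcode { AESE, AESMC, AESD, AESIMC, ADD, WILDCARD };

static bool shouldFuse(Opcode First, Opcode Second) {
  switch (Second) {
  case AESMC:  // AES encode: wants AESE immediately before it
    return First == AESE || First == WILDCARD;
  case AESIMC: // AES decode: wants AESD immediately before it
    return First == AESD || First == WILDCARD;
  default:
    return false;
  }
}

int main() {
  std::printf("%d %d %d\n",
              shouldFuse(AESE, AESMC),     // 1: concrete fusable pair
              shouldFuse(ADD, AESMC),      // 0: wrong producer
              shouldFuse(WILDCARD, AESIMC)); // 1: AESIMC can be fused
  return 0;
}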
+ bool hasFusion() const { return hasFuseAES(); } + const Triple &getTargetTriple() const { return TargetTriple; } bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index c0506cfda6129..eb71e557ec917 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -17,6 +17,7 @@ #include "ARMRegisterBankInfo.h" #endif #include "ARMSubtarget.h" +#include "ARMMacroFusion.h" #include "ARMTargetMachine.h" #include "ARMTargetObjectFile.h" #include "ARMTargetTransformInfo.h" @@ -394,6 +395,9 @@ public: createMachineScheduler(MachineSchedContext *C) const override { ScheduleDAGMILive *DAG = createGenericSchedLive(C); // add DAG Mutations here. + const ARMSubtarget &ST = C->MF->getSubtarget<ARMSubtarget>(); + if (ST.hasFusion()) + DAG->addMutation(createARMMacroFusionDAGMutation()); return DAG; } @@ -401,6 +405,9 @@ public: createPostMachineScheduler(MachineSchedContext *C) const override { ScheduleDAGMI *DAG = createGenericSchedPostRA(C); // add DAG Mutations here. + const ARMSubtarget &ST = C->MF->getSubtarget<ARMSubtarget>(); + if (ST.hasFusion()) + DAG->addMutation(createARMMacroFusionDAGMutation()); return DAG; } diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 19fba3033bb2b..891b5c60e1fd6 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -6860,6 +6860,17 @@ static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) { bool ARMAsmParser::processInstruction(MCInst &Inst, const OperandVector &Operands, MCStreamer &Out) { + // Check if we have the wide qualifier, because if it's present we + // must avoid selecting a 16-bit thumb instruction. + bool HasWideQualifier = false; + for (auto &Op : Operands) { + ARMOperand &ARMOp = static_cast<ARMOperand&>(*Op); + if (ARMOp.isToken() && ARMOp.getToken() == ".w") { + HasWideQualifier = true; + break; + } + } + switch (Inst.getOpcode()) { // Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction. case ARM::LDRT_POST: @@ -6939,8 +6950,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, // Select the narrow version if the immediate will fit. if (Inst.getOperand(1).getImm() > 0 && Inst.getOperand(1).getImm() <= 0xff && - !(static_cast<ARMOperand &>(*Operands[2]).isToken() && - static_cast<ARMOperand &>(*Operands[2]).getToken() == ".w")) + !HasWideQualifier) Inst.setOpcode(ARM::tLDRpci); else Inst.setOpcode(ARM::t2LDRpci); @@ -6971,10 +6981,9 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, else if (Inst.getOpcode() == ARM::t2LDRConstPool) TmpInst.setOpcode(ARM::t2LDRpci); const ARMOperand &PoolOperand = - (static_cast<ARMOperand &>(*Operands[2]).isToken() && - static_cast<ARMOperand &>(*Operands[2]).getToken() == ".w") ? - static_cast<ARMOperand &>(*Operands[4]) : - static_cast<ARMOperand &>(*Operands[3]); + (HasWideQualifier ? + static_cast<ARMOperand &>(*Operands[4]) : + static_cast<ARMOperand &>(*Operands[3])); const MCExpr *SubExprVal = PoolOperand.getConstantPoolImm(); // If SubExprVal is a constant we may be able to use a MOV if (isa<MCConstantExpr>(SubExprVal) && @@ -8117,8 +8126,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, if (isARMLowRegister(Inst.getOperand(0).getReg()) && isARMLowRegister(Inst.getOperand(1).getReg()) && Inst.getOperand(5).getReg() == (inITBlock() ? 
0 : ARM::CPSR) && - !(static_cast<ARMOperand &>(*Operands[3]).isToken() && - static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) { + !HasWideQualifier) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("unexpected opcode"); @@ -8152,7 +8160,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, isARMLowRegister(Inst.getOperand(1).getReg()) && isARMLowRegister(Inst.getOperand(2).getReg()) && Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() && - inITBlock() == (Inst.getOpcode() == ARM::t2MOVsr)) + inITBlock() == (Inst.getOpcode() == ARM::t2MOVsr) && + !HasWideQualifier) isNarrow = true; MCInst TmpInst; unsigned newOpc; @@ -8186,7 +8195,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, bool isNarrow = false; if (isARMLowRegister(Inst.getOperand(0).getReg()) && isARMLowRegister(Inst.getOperand(1).getReg()) && - inITBlock() == (Inst.getOpcode() == ARM::t2MOVsi)) + inITBlock() == (Inst.getOpcode() == ARM::t2MOVsi) && + !HasWideQualifier) isNarrow = true; MCInst TmpInst; unsigned newOpc; @@ -8415,10 +8425,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, !isARMLowRegister(Inst.getOperand(0).getReg()) || (Inst.getOperand(2).isImm() && (unsigned)Inst.getOperand(2).getImm() > 255) || - ((!inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR) || - (inITBlock() && Inst.getOperand(5).getReg() != 0)) || - (static_cast<ARMOperand &>(*Operands[3]).isToken() && - static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) + Inst.getOperand(5).getReg() != (inITBlock() ? 0 : ARM::CPSR) || + HasWideQualifier) break; MCInst TmpInst; TmpInst.setOpcode(Inst.getOpcode() == ARM::t2ADDri ? @@ -8447,8 +8455,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, } if (!Transform || Inst.getOperand(5).getReg() != 0 || - (static_cast<ARMOperand &>(*Operands[3]).isToken() && - static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) + HasWideQualifier) break; MCInst TmpInst; TmpInst.setOpcode(ARM::tADDhirr); @@ -8568,11 +8575,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, if (isARMLowRegister(Inst.getOperand(0).getReg()) && (Inst.getOperand(1).isImm() && (unsigned)Inst.getOperand(1).getImm() <= 255) && - ((!inITBlock() && Inst.getOperand(2).getImm() == ARMCC::AL && - Inst.getOperand(4).getReg() == ARM::CPSR) || - (inITBlock() && Inst.getOperand(4).getReg() == 0)) && - (!static_cast<ARMOperand &>(*Operands[2]).isToken() || - static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) { + Inst.getOperand(4).getReg() == (inITBlock() ? 0 : ARM::CPSR) && + !HasWideQualifier) { // The operands aren't in the same order for tMOVi8... MCInst TmpInst; TmpInst.setOpcode(ARM::tMOVi8); @@ -8593,8 +8597,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, isARMLowRegister(Inst.getOperand(1).getReg()) && Inst.getOperand(2).getImm() == ARMCC::AL && Inst.getOperand(4).getReg() == ARM::CPSR && - (!static_cast<ARMOperand &>(*Operands[2]).isToken() || - static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) { + !HasWideQualifier) { // The operands aren't the same for tMOV[S]r... (no cc_out) MCInst TmpInst; TmpInst.setOpcode(Inst.getOperand(4).getReg() ? 
ARM::tMOVSr : ARM::tMOVr); @@ -8616,8 +8619,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, if (isARMLowRegister(Inst.getOperand(0).getReg()) && isARMLowRegister(Inst.getOperand(1).getReg()) && Inst.getOperand(2).getImm() == 0 && - (!static_cast<ARMOperand &>(*Operands[2]).isToken() || - static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) { + !HasWideQualifier) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("Illegal opcode!"); @@ -8716,11 +8718,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, if ((isARMLowRegister(Inst.getOperand(1).getReg()) && isARMLowRegister(Inst.getOperand(2).getReg())) && Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() && - ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) || - (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && - (!static_cast<ARMOperand &>(*Operands[3]).isToken() || - !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower( - ".w"))) { + Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) && + !HasWideQualifier) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("unexpected opcode"); @@ -8756,11 +8755,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, isARMLowRegister(Inst.getOperand(2).getReg())) && (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() || Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) && - ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) || - (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && - (!static_cast<ARMOperand &>(*Operands[3]).isToken() || - !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower( - ".w"))) { + Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) && + !HasWideQualifier) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("unexpected opcode"); diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt index 3cde43967568b..cf6827fd6ca19 100644 --- a/lib/Target/ARM/CMakeLists.txt +++ b/lib/Target/ARM/CMakeLists.txt @@ -49,6 +49,7 @@ add_llvm_target(ARMCodeGen ARMLoadStoreOptimizer.cpp ARMMCInstLower.cpp ARMMachineFunctionInfo.cpp + ARMMacroFusion.cpp ARMRegisterInfo.cpp ARMOptimizeBarriersPass.cpp ARMSelectionDAGInfo.cpp diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 716492ea25662..81760f03940ad 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -358,11 +358,27 @@ static uint32_t joinHalfWords(uint32_t FirstHalf, uint32_t SecondHalf, return Value; } -unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value, +unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target, uint64_t Value, bool IsPCRel, MCContext &Ctx, bool IsLittleEndian, bool IsResolved) const { unsigned Kind = Fixup.getKind(); + + // MachO tries to make .o files that look vaguely pre-linked, so for MOVW/MOVT + // and .word relocations they put the Thumb bit into the addend if possible. + // Other relocation types don't want this bit though (branches couldn't encode + // it if it *was* present, and no other relocations exist) and it can + // interfere with checking valid expressions. 
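The comment above refers to the ARM interworking convention where bit 0 of a code address selects the instruction set: branching to an address with bit 0 set switches to Thumb state, so MachO folds that bit into the addend of data and MOVW/MOVT relocations against Thumb functions. A minimal standalone illustration of the convention:

#include <cstdint>
#include <cstdio>

// Bit 0 of a code address is not part of the (halfword-aligned) address;
// it encodes the ISA state to branch into.
static uint32_t withThumbBit(uint32_t SymbolValue, bool IsThumbFunc) {
  return IsThumbFunc ? (SymbolValue | 1u) : SymbolValue;
}

int main() {
  std::printf("0x%x\n", withThumbBit(0x8000u, true));  // 0x8001
  std::printf("0x%x\n", withThumbBit(0x8000u, false)); // 0x8000
  return 0;
}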
+ if (const MCSymbolRefExpr *A = Target.getSymA()) { + if (A->hasSubsectionsViaSymbols() && Asm.isThumbFunc(&A->getSymbol()) && + (Kind == FK_Data_4 || Kind == ARM::fixup_arm_movw_lo16 || + Kind == ARM::fixup_arm_movt_hi16 || Kind == ARM::fixup_t2_movw_lo16 || + Kind == ARM::fixup_t2_movt_hi16)) + Value |= 1; + } + switch (Kind) { default: Ctx.reportError(Fixup.getLoc(), "bad relocation fixup type"); @@ -505,6 +521,13 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value, return swapHalfWords(out, IsLittleEndian); } case ARM::fixup_arm_thumb_bl: { + // FIXME: We get both thumb1 and thumb2 in here, so we can only check for + // the less strict thumb2 value. + if (!isInt<26>(Value - 4)) { + Ctx.reportError(Fixup.getLoc(), "Relocation out of range"); + return 0; + } + // The value doesn't encode the low bit (always zero) and is offset by // four. The 32-bit immediate value is encoded as // imm32 = SignExtend(S:I1:I2:imm10:imm11:0) @@ -716,29 +739,11 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value, } void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, const MCFixup &Fixup, - const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) { + const MCValue &Target, bool &IsResolved) { const MCSymbolRefExpr *A = Target.getSymA(); const MCSymbol *Sym = A ? &A->getSymbol() : nullptr; const unsigned FixupKind = Fixup.getKind() ; - // MachO (the only user of "Value") tries to make .o files that look vaguely - // pre-linked, so for MOVW/MOVT and .word relocations they put the Thumb bit - // into the addend if possible. Other relocation types don't want this bit - // though (branches couldn't encode it if it *was* present, and no other - // relocations exist) and it can interfere with checking valid expressions. - if (FixupKind == FK_Data_4 || - FixupKind == ARM::fixup_arm_movw_lo16 || - FixupKind == ARM::fixup_arm_movt_hi16 || - FixupKind == ARM::fixup_t2_movw_lo16 || - FixupKind == ARM::fixup_t2_movt_hi16) { - if (Sym) { - if (Asm.isThumbFunc(Sym)) - Value |= 1; - } - } if (IsResolved && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) { assert(Sym && "How did we resolve this?"); @@ -747,7 +752,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, // If the symbol is out of range, produce a relocation and hope the // linker can handle it. GNU AS produces an error in this case. 
- if (Sym->isExternal() || Value >= 0x400004) + if (Sym->isExternal()) IsResolved = false; } // Create relocations for unconditional branches to function symbols with @@ -759,6 +764,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, IsResolved = false; if (!Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_thumb_br || FixupKind == ARM::fixup_arm_thumb_bl || + FixupKind == ARM::fixup_t2_condbranch || FixupKind == ARM::fixup_t2_uncondbranch)) IsResolved = false; } @@ -875,22 +881,25 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) { } } -void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, bool IsPCRel, - MCContext &Ctx) const { +void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - Value = adjustFixupValue(Fixup, Value, IsPCRel, Ctx, IsLittleEndian, true); + MCContext &Ctx = Asm.getContext(); + Value = adjustFixupValue(Asm, Fixup, Target, Value, IsPCRel, Ctx, + IsLittleEndian, true); if (!Value) return; // Doesn't change encoding. unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); // Used to point to big endian bytes. unsigned FullSizeBytes; if (!IsLittleEndian) { FullSizeBytes = getFixupKindContainerSizeBytes(Fixup.getKind()); - assert((Offset + FullSizeBytes) <= DataSize && "Invalid fixup size!"); + assert((Offset + FullSizeBytes) <= Data.size() && "Invalid fixup size!"); assert(NumBytes <= FullSizeBytes && "Invalid fixup size!"); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h index 2ddedb5d61059..6a0ba2ed41c1a 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -40,17 +40,17 @@ public: /// processFixupValue - Target hook to process the literal value of a fixup /// if necessary. 
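The applyFixup signature change above, from a raw char*/DataSize pair to MutableArrayRef<char>, keeps the buffer and its length together so the bounds assertions cannot drift from the real fragment size. A standalone sketch of patching a little-endian fixup into such a span, with std::vector standing in for the fragment contents and the byte loop mirroring the OR-into-place idiom these backends use:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// OR the NumBytes low-order bytes of Value into Data at Offset,
// little-endian first.
static void applyFixupLE(std::vector<char> &Data, unsigned Offset,
                         unsigned NumBytes, uint64_t Value) {
  assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
  for (unsigned i = 0; i != NumBytes; ++i)
    Data[Offset + i] |= static_cast<char>((Value >> (i * 8)) & 0xff);
}

int main() {
  std::vector<char> Frag(8, 0);
  applyFixupLE(Frag, 4, 4, 0x12345678);
  std::printf("0x%02x 0x%02x\n", (unsigned char)Frag[4],
              (unsigned char)Frag[7]); // 0x78 0x12
  return 0;
}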
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override; + void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, bool &IsResolved) override; - unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, bool IsPCRel, + unsigned adjustFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, uint64_t Value, bool IsPCRel, MCContext &Ctx, bool IsLittleEndian, bool IsResolved) const; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; unsigned getRelaxedOpcode(unsigned Op) const; diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp index 00505a103e00f..f74fb2e20b5a3 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp @@ -33,8 +33,8 @@ public: ~ARMWinCOFFObjectWriter() override = default; - unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsCrossSection, + unsigned getRelocType(MCContext &Ctx, const MCValue &Target, + const MCFixup &Fixup, bool IsCrossSection, const MCAsmBackend &MAB) const override; bool recordRelocation(const MCFixup &) const override; @@ -42,7 +42,8 @@ public: } // end anonymous namespace -unsigned ARMWinCOFFObjectWriter::getRelocType(const MCValue &Target, +unsigned ARMWinCOFFObjectWriter::getRelocType(MCContext &Ctx, + const MCValue &Target, const MCFixup &Fixup, bool IsCrossSection, const MCAsmBackend &MAB) const { diff --git a/lib/Target/BPF/BPFISelDAGToDAG.cpp b/lib/Target/BPF/BPFISelDAGToDAG.cpp index 7d5fb6ca17b98..c6ddd6bdad5e6 100644 --- a/lib/Target/BPF/BPFISelDAGToDAG.cpp +++ b/lib/Target/BPF/BPFISelDAGToDAG.cpp @@ -214,7 +214,12 @@ void BPFDAGToDAGISel::PreprocessISelDAG() { if (Opcode != ISD::LOAD) continue; - unsigned char new_val[8]; // hold up the constant values replacing loads. + union { + uint8_t c[8]; + uint16_t s; + uint32_t i; + uint64_t d; + } new_val; // hold up the constant values replacing loads. 
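The BPF change above replaces raw char-buffer type punning (*(uint16_t *)buf and friends) with a union, sidestepping strict-aliasing and alignment hazards. A standalone sketch of the same idiom as used for the host-endianness probe further down in this hunk; reading the non-active union member is the common compiler-supported behavior the patch relies on:

#include <cstdint>
#include <cstdio>

int main() {
  union {
    uint8_t c[2];
    uint16_t s;
  } test_buf;
  // Write the bytes of 0x2345 in little-endian order, then read them
  // back through the wider member.
  test_buf.c[0] = 0x45;
  test_buf.c[1] = 0x23;
  bool endian_match = (test_buf.s == 0x2345);
  std::printf("host is %s-endian\n", endian_match ? "little" : "big");
  return 0;
}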
bool to_replace = false; SDLoc DL(Node); const LoadSDNode *LD = cast<LoadSDNode>(Node); @@ -242,7 +247,7 @@ void BPFDAGToDAGISel::PreprocessISelDAG() { const ConstantSDNode *CDN = dyn_cast<ConstantSDNode>(OP2.getNode()); if (GADN && CDN) to_replace = - getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val); + getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val.c); } else if (LDAddrNode->getOpcode() > ISD::BUILTIN_OP_END && LDAddrNode->getNumOperands() > 0) { DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n'); @@ -250,7 +255,7 @@ void BPFDAGToDAGISel::PreprocessISelDAG() { SDValue OP1 = LDAddrNode->getOperand(0); if (const GlobalAddressSDNode *GADN = dyn_cast<GlobalAddressSDNode>(OP1.getNode())) - to_replace = getConstantFieldValue(GADN, 0, size, new_val); + to_replace = getConstantFieldValue(GADN, 0, size, new_val.c); } if (!to_replace) @@ -259,13 +264,13 @@ void BPFDAGToDAGISel::PreprocessISelDAG() { // replacing the old with a new value uint64_t val; if (size == 1) - val = *(uint8_t *)new_val; + val = new_val.c[0]; else if (size == 2) - val = *(uint16_t *)new_val; + val = new_val.s; else if (size == 4) - val = *(uint32_t *)new_val; + val = new_val.i; else { - val = *(uint64_t *)new_val; + val = new_val.d; } DEBUG(dbgs() << "Replacing load of size " << size << " with constant " @@ -318,14 +323,17 @@ bool BPFDAGToDAGISel::getConstantFieldValue(const GlobalAddressSDNode *Node, } // test whether host endianness matches target - uint8_t test_buf[2]; + union { + uint8_t c[2]; + uint16_t s; + } test_buf; uint16_t test_val = 0x2345; if (DL.isLittleEndian()) - support::endian::write16le(test_buf, test_val); + support::endian::write16le(test_buf.c, test_val); else - support::endian::write16be(test_buf, test_val); + support::endian::write16be(test_buf.c, test_val); - bool endian_match = *(uint16_t *)test_buf == test_val; + bool endian_match = test_buf.s == test_val; for (uint64_t i = Offset, j = 0; i < Offset + Size; i++, j++) ByteSeq[j] = endian_match ? 
TmpVal[i] : TmpVal[Offset + Size - 1 - j]; diff --git a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp index 80357a63a4e12..15e89fb2a2611 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp @@ -27,8 +27,9 @@ public: : MCAsmBackend(), IsLittleEndian(IsLittleEndian) {} ~BPFAsmBackend() override = default; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; @@ -61,9 +62,10 @@ bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { return true; } -void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, bool IsPCRel, - MCContext &Ctx) const { +void BPFAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) { assert(Value == 0); } else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) { diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp index 9f8c9ded8127b..734f3c6658d92 100644 --- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -567,8 +567,19 @@ unsigned HexagonExpandCondsets::getCondTfrOpcode(const MachineOperand &SO, } llvm_unreachable("Invalid register operand"); } - if (SO.isImm() || SO.isFPImm()) - return IfTrue ? C2_cmoveit : C2_cmoveif; + switch (SO.getType()) { + case MachineOperand::MO_Immediate: + case MachineOperand::MO_FPImmediate: + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_TargetIndex: + case MachineOperand::MO_JumpTableIndex: + case MachineOperand::MO_ExternalSymbol: + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_BlockAddress: + return IfTrue ? C2_cmoveit : C2_cmoveif; + default: + break; + } llvm_unreachable("Unexpected source operand"); } diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index 18e49c69b8e36..2b0ceaa66258e 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1051,10 +1051,26 @@ int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF, bool HasExtraAlign = HRI.needsStackRealignment(MF); bool NoOpt = MF.getTarget().getOptLevel() == CodeGenOpt::None; + unsigned FrameSize = MFI.getStackSize(); unsigned SP = HRI.getStackRegister(), FP = HRI.getFrameRegister(); auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>(); unsigned AP = HMFI.getStackAlignBasePhysReg(); - unsigned FrameSize = MFI.getStackSize(); + // It may happen that AP will be absent even HasAlloca && HasExtraAlign + // is true. HasExtraAlign may be set because of vector spills, without + // aligned locals or aligned outgoing function arguments. Since vector + // spills will ultimately be "unaligned", it is safe to use FP as the + // base register. + // In fact, in such a scenario the stack is actually not required to be + // aligned, although it may end up being aligned anyway, since this + // particular case is not easily detectable. 
The alignment will be + // unnecessary, but not incorrect. + // Unfortunately there is no quick way to verify that the above is + // indeed the case (and that it's not a result of an error), so just + // assume that missing AP will be replaced by FP. + // (A better fix would be to rematerialize AP from FP and always align + // vector spills.) + if (AP == 0) + AP = FP; bool UseFP = false, UseAP = false; // Default: use SP (except at -O0). // Use FP at -O0, except when there are objects with extra alignment. @@ -2454,9 +2470,44 @@ bool HexagonFrameLowering::mayOverflowFrameOffset(MachineFunction &MF) const { unsigned StackSize = MF.getFrameInfo().estimateStackSize(MF); auto &HST = MF.getSubtarget<HexagonSubtarget>(); // A fairly simplistic guess as to whether a potential load/store to a - // stack location could require an extra register. It does not account - // for store-immediate instructions. - if (HST.useHVXOps()) - return StackSize > 256; + // stack location could require an extra register. + if (HST.useHVXOps() && StackSize > 256) + return true; + + // Check if the function has store-immediate instructions that access + // the stack. Since the offset field is not extendable, if the stack + // size exceeds the offset limit (6 bits, shifted), the stores will + // require a new base register. + bool HasImmStack = false; + unsigned MinLS = ~0u; // Log_2 of the memory access size. + + for (const MachineBasicBlock &B : MF) { + for (const MachineInstr &MI : B) { + unsigned LS = 0; + switch (MI.getOpcode()) { + case Hexagon::S4_storeirit_io: + case Hexagon::S4_storeirif_io: + case Hexagon::S4_storeiri_io: + ++LS; + LLVM_FALLTHROUGH; + case Hexagon::S4_storeirht_io: + case Hexagon::S4_storeirhf_io: + case Hexagon::S4_storeirh_io: + ++LS; + LLVM_FALLTHROUGH; + case Hexagon::S4_storeirbt_io: + case Hexagon::S4_storeirbf_io: + case Hexagon::S4_storeirb_io: + if (MI.getOperand(0).isFI()) + HasImmStack = true; + MinLS = std::min(MinLS, LS); + break; + } + } + } + + if (HasImmStack) + return !isUInt<6>(StackSize >> MinLS); + return false; } diff --git a/lib/Target/Hexagon/HexagonGenMux.cpp b/lib/Target/Hexagon/HexagonGenMux.cpp index 11ac5454f6043..5abbcbba72ddd 100644 --- a/lib/Target/Hexagon/HexagonGenMux.cpp +++ b/lib/Target/Hexagon/HexagonGenMux.cpp @@ -28,6 +28,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -295,15 +296,12 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) { unsigned SR1 = Src1->isReg() ? Src1->getReg() : 0; unsigned SR2 = Src2->isReg() ? Src2->getReg() : 0; bool Failure = false, CanUp = true, CanDown = true; - bool Used1 = false, Used2 = false; for (unsigned X = MinX+1; X < MaxX; X++) { const DefUseInfo &DU = DUM.lookup(X); if (DU.Defs[PR] || DU.Defs[DR] || DU.Uses[DR]) { Failure = true; break; } - Used1 |= DU.Uses[SR1]; - Used2 |= DU.Uses[SR2]; if (CanDown && DU.Defs[SR1]) CanDown = false; if (CanUp && DU.Defs[SR2]) @@ -317,64 +315,52 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) { // Prefer "down", since this will move the MUX farther away from the // predicate definition. MachineBasicBlock::iterator At = CanDown ? Def2 : Def1; - if (CanDown) { - // If the MUX is placed "down", we need to make sure that there aren't - // any kills of the source registers between the two defs. 
- if (Used1 || Used2) { - auto ResetKill = [this] (unsigned Reg, MachineInstr &MI) -> bool { - if (MachineOperand *Op = MI.findRegisterUseOperand(Reg, true, HRI)) { - Op->setIsKill(false); - return true; - } - return false; - }; - bool KilledSR1 = false, KilledSR2 = false; - for (MachineInstr &MJ : make_range(std::next(It1), It2)) { - if (SR1) - KilledSR1 |= ResetKill(SR1, MJ); - if (SR2) - KilledSR2 |= ResetKill(SR1, MJ); - } - // If any of the source registers were killed in this range, transfer - // the kills to the source operands: they will me "moved" to the - // resulting MUX and their parent instructions will be deleted. - if (KilledSR1) { - assert(Src1->isReg()); - Src1->setIsKill(true); - } - if (KilledSR2) { - assert(Src2->isReg()); - Src2->setIsKill(true); - } - } - } else { - // If the MUX is placed "up", it shouldn't kill any source registers - // that are still used afterwards. We can reset the kill flags directly - // on the operands, because the source instructions will be erased. - if (Used1 && Src1->isReg()) - Src1->setIsKill(false); - if (Used2 && Src2->isReg()) - Src2->setIsKill(false); - } ML.push_back(MuxInfo(At, DR, PR, SrcT, SrcF, Def1, Def2)); } - for (unsigned I = 0, N = ML.size(); I < N; ++I) { - MuxInfo &MX = ML[I]; - MachineBasicBlock &B = *MX.At->getParent(); - DebugLoc DL = MX.At->getDebugLoc(); + for (MuxInfo &MX : ML) { unsigned MxOpc = getMuxOpcode(*MX.SrcT, *MX.SrcF); if (!MxOpc) continue; - BuildMI(B, MX.At, DL, HII->get(MxOpc), MX.DefR) - .addReg(MX.PredR) - .add(*MX.SrcT) - .add(*MX.SrcF); + MachineBasicBlock &B = *MX.At->getParent(); + const DebugLoc &DL = B.findDebugLoc(MX.At); + auto NewMux = BuildMI(B, MX.At, DL, HII->get(MxOpc), MX.DefR) + .addReg(MX.PredR) + .add(*MX.SrcT) + .add(*MX.SrcF); + NewMux->clearKillInfo(); B.erase(MX.Def1); B.erase(MX.Def2); Changed = true; } + // Fix up kill flags. + + LivePhysRegs LPR(*HRI); + LPR.addLiveOuts(B); + auto IsLive = [&LPR,this] (unsigned Reg) -> bool { + for (MCSubRegIterator S(Reg, HRI, true); S.isValid(); ++S) + if (LPR.contains(*S)) + return true; + return false; + }; + for (auto I = B.rbegin(), E = B.rend(); I != E; ++I) { + if (I->isDebugValue()) + continue; + // This isn't 100% accurate, but it's safe. + // It won't detect (as a kill) a case like this + // r0 = add r0, 1 <-- r0 should be "killed" + // ... = r0 + for (MachineOperand &Op : I->operands()) { + if (!Op.isReg() || !Op.isUse()) + continue; + assert(Op.getSubReg() == 0 && "Should have physical registers only"); + bool Live = IsLive(Op.getReg()); + Op.setIsKill(!Live); + } + LPR.stepBackward(*I); + } + return Changed; } diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index f43101fa456d5..fec2dc5ce3066 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -94,10 +94,6 @@ static cl::opt<bool> UseDFAHazardRec("dfa-hazard-rec", /// /// Constants for Hexagon instructions. 
/// -const int Hexagon_MEMV_OFFSET_MAX_128B = 896; // #s4: -8*128...7*128 -const int Hexagon_MEMV_OFFSET_MIN_128B = -1024; // #s4 -const int Hexagon_MEMV_OFFSET_MAX = 448; // #s4: -8*64...7*64 -const int Hexagon_MEMV_OFFSET_MIN = -512; // #s4 const int Hexagon_MEMW_OFFSET_MAX = 4095; const int Hexagon_MEMW_OFFSET_MIN = -4096; const int Hexagon_MEMD_OFFSET_MAX = 8191; @@ -2443,8 +2439,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::V6_vS32b_ai: case Hexagon::V6_vL32Ub_ai: case Hexagon::V6_vS32Ub_ai: - return (Offset >= Hexagon_MEMV_OFFSET_MIN) && - (Offset <= Hexagon_MEMV_OFFSET_MAX); + return isShiftedInt<4,6>(Offset); case Hexagon::PS_vstorerq_ai_128B: case Hexagon::PS_vstorerw_ai_128B: @@ -2454,8 +2449,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::V6_vS32b_ai_128B: case Hexagon::V6_vL32Ub_ai_128B: case Hexagon::V6_vS32Ub_ai_128B: - return (Offset >= Hexagon_MEMV_OFFSET_MIN_128B) && - (Offset <= Hexagon_MEMV_OFFSET_MAX_128B); + return isShiftedInt<4,7>(Offset); case Hexagon::J2_loop0i: case Hexagon::J2_loop1i: diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp index d73fc7c73185d..de6b203015d8e 100644 --- a/lib/Target/Hexagon/HexagonNewValueJump.cpp +++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -629,7 +629,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { if (MO.isReg() && MO.isUse()) { unsigned feederReg = MO.getReg(); for (MachineBasicBlock::iterator localII = feederPos, - end = jmpPos; localII != end; localII++) { + end = cmpInstr->getIterator(); localII != end; localII++) { MachineInstr &localMI = *localII; for (unsigned j = 0; j < localMI.getNumOperands(); j++) { MachineOperand &localMO = localMI.getOperand(j); diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp index ee3209354688d..7d961a238ae28 100644 --- a/lib/Target/Hexagon/HexagonPeephole.cpp +++ b/lib/Target/Hexagon/HexagonPeephole.cpp @@ -100,9 +100,6 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { MachineFunctionPass::getAnalysisUsage(AU); } - - private: - void ChangeOpInto(MachineOperand &Dst, MachineOperand &Src); }; } @@ -132,7 +129,9 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { PeepholeDoubleRegsMap.clear(); // Traverse the basic block. - for (MachineInstr &MI : *MBB) { + for (auto I = MBB->begin(), E = MBB->end(), NextI = I; I != E; I = NextI) { + NextI = std::next(I); + MachineInstr &MI = *I; // Look for sign extends: // %vreg170<def> = SXTW %vreg166 if (!DisableOptSZExt && MI.getOpcode() == Hexagon::A2_sxtw) { @@ -280,14 +279,13 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { if (NewOp) { unsigned PSrc = MI.getOperand(PR).getReg(); if (unsigned POrig = PeepholeMap.lookup(PSrc)) { - MI.getOperand(PR).setReg(POrig); + BuildMI(*MBB, MI.getIterator(), MI.getDebugLoc(), + QII->get(NewOp), MI.getOperand(0).getReg()) + .addReg(POrig) + .add(MI.getOperand(S2)) + .add(MI.getOperand(S1)); MRI->clearKillFlags(POrig); - MI.setDesc(QII->get(NewOp)); - // Swap operands S1 and S2. 
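The isValidOffset change above and the mayOverflowFrameOffset change earlier both lean on LLVM's templated range predicates instead of hand-maintained constants: isShiftedInt<4,6> accepts exactly the multiples of 64 in [-512, 448] that the deleted Hexagon_MEMV_OFFSET_* constants described. A standalone sketch with local stand-ins for those helpers (assumed semantics of llvm::isUInt/isShiftedInt):

#include <cstdint>
#include <cstdio>

// X fits in an N-bit unsigned field.
template <unsigned N> static bool isUIntN(uint64_t X) {
  return X < (UINT64_C(1) << N);
}

// X is a multiple of 2^S whose scaled value fits in an N-bit signed
// field, i.e. X lies in [-2^(N-1) * 2^S, (2^(N-1) - 1) * 2^S].
template <unsigned N, unsigned S> static bool isShiftedIntNS(int64_t X) {
  return (X % (INT64_C(1) << S)) == 0 &&
         X >= -(INT64_C(1) << (N + S - 1)) && X < (INT64_C(1) << (N + S - 1));
}

int main() {
  // HVX 64-byte vectors: #s4 scaled by 64 -> -512..448 in steps of 64.
  std::printf("%d %d %d\n",
              isShiftedIntNS<4, 6>(448),  // 1: top of range
              isShiftedIntNS<4, 6>(-512), // 1: bottom of range
              isShiftedIntNS<4, 6>(100)); // 0: not 64-aligned
  std::printf("%d\n", isUIntN<6>(63));    // 1: largest 6-bit value
  return 0;
}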
- MachineOperand Op1 = MI.getOperand(S1); - MachineOperand Op2 = MI.getOperand(S2); - ChangeOpInto(MI.getOperand(S1), Op2); - ChangeOpInto(MI.getOperand(S2), Op1); + MI.eraseFromParent(); } } // if (NewOp) } // if (!Done) @@ -299,40 +297,6 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { return true; } -void HexagonPeephole::ChangeOpInto(MachineOperand &Dst, MachineOperand &Src) { - assert (&Dst != &Src && "Cannot duplicate into itself"); - switch (Dst.getType()) { - case MachineOperand::MO_Register: - if (Src.isReg()) { - Dst.setReg(Src.getReg()); - Dst.setSubReg(Src.getSubReg()); - MRI->clearKillFlags(Src.getReg()); - } else if (Src.isImm()) { - Dst.ChangeToImmediate(Src.getImm()); - } else { - llvm_unreachable("Unexpected src operand type"); - } - break; - - case MachineOperand::MO_Immediate: - if (Src.isImm()) { - Dst.setImm(Src.getImm()); - } else if (Src.isReg()) { - Dst.ChangeToRegister(Src.getReg(), Src.isDef(), Src.isImplicit(), - false, Src.isDead(), Src.isUndef(), - Src.isDebug()); - Dst.setSubReg(Src.getSubReg()); - } else { - llvm_unreachable("Unexpected src operand type"); - } - break; - - default: - llvm_unreachable("Unexpected dst operand type"); - break; - } -} - FunctionPass *llvm::createHexagonPeephole() { return new HexagonPeephole(); } diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp index 8851a23ae8ace..0aada8a53c979 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -1,4 +1,4 @@ -//===-- HexagonSubtarget.cpp - Hexagon Subtarget Information --------------===// +//===- HexagonSubtarget.cpp - Hexagon Subtarget Information ---------------===// // // The LLVM Compiler Infrastructure // @@ -11,13 +11,23 @@ // //===----------------------------------------------------------------------===// -#include "HexagonSubtarget.h" #include "Hexagon.h" +#include "HexagonInstrInfo.h" #include "HexagonRegisterInfo.h" +#include "HexagonSubtarget.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "MCTargetDesc/HexagonMCTargetDesc.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" +#include <algorithm> +#include <cassert> #include <map> using namespace llvm; @@ -119,9 +129,7 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, StringRef FS, const TargetMachine &TM) : HexagonGenSubtargetInfo(TT, CPU, FS), CPUString(CPU), - InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), - FrameLowering() { - + InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) { initializeEnvironment(); // Initialize scheduling itinerary for the specified CPU. 
@@ -196,7 +204,6 @@ void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst, updateLatency(*SrcInst, *DstInst, Dep); } - void HexagonSubtarget::HexagonDAGMutation::apply(ScheduleDAGInstrs *DAG) { for (auto &SU : DAG->SUnits) { if (!SU.isInstr()) @@ -240,18 +247,18 @@ void HexagonSubtarget::HexagonDAGMutation::apply(ScheduleDAGInstrs *DAG) { } } - void HexagonSubtarget::getPostRAMutations( - std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { - Mutations.push_back(make_unique<HexagonSubtarget::HexagonDAGMutation>()); + std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { + Mutations.push_back( + llvm::make_unique<HexagonSubtarget::HexagonDAGMutation>()); } void HexagonSubtarget::getSMSMutations( - std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { - Mutations.push_back(make_unique<HexagonSubtarget::HexagonDAGMutation>()); + std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { + Mutations.push_back( + llvm::make_unique<HexagonSubtarget::HexagonDAGMutation>()); } - // Pin the vtable to this file. void HexagonSubtarget::anchor() {} @@ -447,4 +454,3 @@ unsigned HexagonSubtarget::getL1PrefetchDistance() const { bool HexagonSubtarget::enableSubRegLiveness() const { return EnableSubregLiveness; } - diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h index 4379efa79c9cd..753dca0000652 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.h +++ b/lib/Target/Hexagon/HexagonSubtarget.h @@ -1,4 +1,4 @@ -//===-- HexagonSubtarget.h - Define Subtarget for the Hexagon ---*- C++ -*-===// +//===- HexagonSubtarget.h - Define Subtarget for the Hexagon ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -15,12 +15,17 @@ #define LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H #include "HexagonFrameLowering.h" -#include "HexagonISelLowering.h" #include "HexagonInstrInfo.h" +#include "HexagonISelLowering.h" #include "HexagonSelectionDAGInfo.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include "llvm/MC/MCInstrItineraries.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <memory> #include <string> +#include <vector> #define GET_SUBTARGETINFO_HEADER #include "HexagonGenSubtargetInfo.inc" @@ -30,6 +35,12 @@ namespace llvm { +class MachineInstr; +class SDep; +class SUnit; +class TargetMachine; +class Triple; + class HexagonSubtarget : public HexagonGenSubtargetInfo { virtual void anchor(); @@ -57,6 +68,7 @@ private: HexagonSelectionDAGInfo TSInfo; HexagonFrameLowering FrameLowering; InstrItineraryData InstrItins; + void initializeEnvironment(); public: @@ -108,6 +120,7 @@ public: bool useBSBScheduling() const { return UseBSBScheduling; } bool enableMachineScheduler() const override; + // Always use the TargetLowering default scheduler. // FIXME: This will use the vliw scheduler which is probably just hurting // compiler time and will be removed eventually anyway. 
@@ -124,6 +137,7 @@ public: unsigned getSmallDataThreshold() const { return Hexagon_SMALL_DATA_THRESHOLD; } + const HexagonArchEnum &getHexagonArchVersion() const { return HexagonArchVersion; } @@ -155,4 +169,4 @@ private: } // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index e507a797871fc..031a1bdefafbf 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -291,7 +291,6 @@ bool HexagonPassConfig::addInstSelector() { if (EnableBitSimplify) addPass(createHexagonBitSimplify()); addPass(createHexagonPeephole()); - printAndVerify("After hexagon peephole pass"); // Constant propagation. if (!DisableHCP) { addPass(createHexagonConstPropagationPass()); diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 545c8b6b2acde..093ce80bc2e3f 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -202,10 +202,8 @@ public: /// processFixupValue - Target hook to adjust the literal value of a fixup /// if necessary. IsResolved signals whether the caller believes a relocation /// is needed; the target can modify the value. The default does nothing. - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override { + void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, bool &IsResolved) override { MCFixupKind Kind = Fixup.getKind(); switch((unsigned)Kind) { @@ -415,9 +413,9 @@ public: /// ApplyFixup - Apply the \arg Value for given \arg Fixup into the provided /// data fragment, at the offset specified by the fixup and following the /// fixup kind as appropriate. - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t FixupValue, bool IsPCRel, - MCContext &Ctx) const override { + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t FixupValue, bool IsPCRel) const override { // When FixupValue is 0 the relocation is external and there // is nothing for us to do. @@ -432,8 +430,8 @@ public: // to a real offset before we can use it. 
uint32_t Offset = Fixup.getOffset(); unsigned NumBytes = getFixupKindNumBytes(Kind); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); - char *InstAddr = Data + Offset; + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + char *InstAddr = Data.data() + Offset; Value = adjustFixupValue(Kind, FixupValue); if(!Value) @@ -517,7 +515,7 @@ public: dbgs() << "\tBValue=0x"; dbgs().write_hex(Value) << ": AValue=0x"; dbgs().write_hex(FixupValue) << ": Offset=" << Offset << - ": Size=" << DataSize << + ": Size=" << Data.size() << ": OInst=0x"; dbgs().write_hex(OldData) << ": Reloc=0x"; dbgs().write_hex(Reloc);); diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp index e8f154a1fa533..c7114c7f18a0a 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp @@ -701,33 +701,32 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { break; case Hexagon::A2_addi: Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value); - assert(Absolute);(void)Absolute; - if (Value == 1) { - Result.setOpcode(Hexagon::SA1_inc); - addOps(Result, Inst, 0); - addOps(Result, Inst, 1); - break; - } // 1,2 SUBInst $Rd = add($Rs, #1) - else if (Value == -1) { - Result.setOpcode(Hexagon::SA1_dec); - addOps(Result, Inst, 0); - addOps(Result, Inst, 1); - addOps(Result, Inst, 2); - break; - } // 1,2 SUBInst $Rd = add($Rs,#-1) - else if (Inst.getOperand(1).getReg() == Hexagon::R29) { - Result.setOpcode(Hexagon::SA1_addsp); - addOps(Result, Inst, 0); - addOps(Result, Inst, 2); - break; - } // 1,3 SUBInst $Rd = add(r29, #$u6_2) - else { - Result.setOpcode(Hexagon::SA1_addi); - addOps(Result, Inst, 0); - addOps(Result, Inst, 1); - addOps(Result, Inst, 2); - break; - } // 1,2,3 SUBInst $Rx = add($Rx, #$s7) + if (Absolute) { + if (Value == 1) { + Result.setOpcode(Hexagon::SA1_inc); + addOps(Result, Inst, 0); + addOps(Result, Inst, 1); + break; + } // 1,2 SUBInst $Rd = add($Rs, #1) + if (Value == -1) { + Result.setOpcode(Hexagon::SA1_dec); + addOps(Result, Inst, 0); + addOps(Result, Inst, 1); + addOps(Result, Inst, 2); + break; + } // 1,2 SUBInst $Rd = add($Rs,#-1) + if (Inst.getOperand(1).getReg() == Hexagon::R29) { + Result.setOpcode(Hexagon::SA1_addsp); + addOps(Result, Inst, 0); + addOps(Result, Inst, 2); + break; + } // 1,3 SUBInst $Rd = add(r29, #$u6_2) + } + Result.setOpcode(Hexagon::SA1_addi); + addOps(Result, Inst, 0); + addOps(Result, Inst, 1); + addOps(Result, Inst, 2); + break; // 1,2,3 SUBInst $Rx = add($Rx, #$s7) case Hexagon::A2_add: Result.setOpcode(Hexagon::SA1_addrx); addOps(Result, Inst, 0); diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp index 0ef1401ef531a..c212726113ab7 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp @@ -49,8 +49,9 @@ public: LanaiAsmBackend(const Target &T, Triple::OSType OST) : MCAsmBackend(), OSType(OST) {} - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; @@ -88,9 +89,10 @@ bool LanaiAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) 
const { return true; } -void LanaiAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned /*DataSize*/, uint64_t Value, - bool /*IsPCRel*/, MCContext & /*Ctx*/) const { +void LanaiAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool /*IsPCRel*/) const { MCFixupKind Kind = Fixup.getKind(); Value = adjustFixupValue(static_cast<unsigned>(Kind), Value); diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp index d8fdc8ba674e6..982c6fea62d44 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -32,16 +32,20 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) { return *RM; } +static std::string computeDataLayout(const Triple &TT, StringRef CPU, + const TargetOptions &Options) { + return "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16"; +} + MSP430TargetMachine::MSP430TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, "e-m:e-p:16:16-i32:16:32-a:16-n8:16", TT, CPU, FS, + : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options), TT, CPU, FS, Options, getEffectiveRelocModel(RM), CM, OL), TLOF(make_unique<TargetLoweringObjectFileELF>()), - // FIXME: Check DataLayout string. Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 694c201cbe8dc..9d5c179a0fd90 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -322,6 +322,7 @@ class MipsAsmParser : public MCTargetAsmParser { bool parseDirectiveSet(); bool parseDirectiveOption(); bool parseInsnDirective(); + bool parseRSectionDirective(StringRef Section); bool parseSSectionDirective(StringRef Section, unsigned Type); bool parseSetAtDirective(); @@ -5106,7 +5107,7 @@ int MipsAsmParser::matchCPURegisterName(StringRef Name) { CC = StringSwitch<unsigned>(Name) .Case("zero", 0) - .Case("at", 1) + .Cases("at", "AT", 1) .Case("a0", 4) .Case("a1", 5) .Case("a2", 6) @@ -6952,6 +6953,23 @@ bool MipsAsmParser::parseInsnDirective() { return false; } +/// parseRSectionDirective +/// ::= .rdata +bool MipsAsmParser::parseRSectionDirective(StringRef Section) { + // If this is not the end of the statement, report an error. + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("unexpected token, expected end of statement"); + return false; + } + + MCSection *ELFSection = getContext().getELFSection( + Section, ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + getParser().getStreamer().SwitchSection(ELFSection); + + getParser().Lex(); // Eat EndOfStatement token. 
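parseRSectionDirective above gives the legacy MIPS .rdata spelling a home by switching the streamer to an ELF .rodata section (SHT_PROGBITS, SHF_ALLOC). A standalone sketch of the directive-to-section mapping, using a hypothetical helper rather than the MC streamer API:

#include <cstdio>
#include <string>

// Map assembler directives to the ELF section they should switch to;
// ".rdata" is an alias kept for compatibility with older MIPS assembly.
static std::string sectionForDirective(const std::string &Directive) {
  if (Directive == ".rdata")
    return ".rodata";
  return Directive; // .sbss, .sdata, ... already name their sections
}

int main() {
  std::printf("%s -> %s\n", ".rdata", sectionForDirective(".rdata").c_str());
  return 0;
}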
+ return false; +} + /// parseSSectionDirective /// ::= .sbss /// ::= .sdata @@ -7499,6 +7517,10 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { parseInsnDirective(); return false; } + if (IDVal == ".rdata") { + parseRSectionDirective(".rodata"); + return false; + } if (IDVal == ".sbss") { parseSSectionDirective(IDVal, ELF::SHT_NOBITS); return false; diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 6d3d4db036032..ae48d6e38fa0f 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -235,10 +235,12 @@ static unsigned calculateMMLEIndex(unsigned i) { /// ApplyFixup - Apply the \p Value for given \p Fixup into the provided /// data fragment, at the offset specified by the fixup and following the /// fixup kind as appropriate. -void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, bool IsPCRel, - MCContext &Ctx) const { +void MipsAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { MCFixupKind Kind = Fixup.getKind(); + MCContext &Ctx = Asm.getContext(); Value = adjustFixupValue(Fixup, Value, Ctx); if (!Value) diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index 4b3cc6e21f4cd..bf3b290b7ed53 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -38,8 +38,9 @@ public: MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; Optional<MCFixupKind> getFixupKind(StringRef Name) const override; const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td index 3272319ad50f4..7daea163b8a64 100644 --- a/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/lib/Target/Mips/Mips32r6InstrInfo.td @@ -326,9 +326,9 @@ class AUIPC_DESC : ALUIPC_DESC_BASE<"auipc", GPR32Opnd, II_AUIPC>; class AUI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, InstrItinClass itin = NoItinerary> : MipsR6Arch<instr_asm> { - dag OutOperandList = (outs GPROpnd:$rs); - dag InOperandList = (ins GPROpnd:$rt, uimm16:$imm); - string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $imm"); + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins GPROpnd:$rs, uimm16:$imm); + string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $imm"); list<dag> Pattern = []; InstrItinClass Itinerary = itin; } diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 68708dc4f50fe..02102d6b22f4e 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -907,6 +907,11 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, if (!(CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1)))) return SDValue(); } + // Don't generate INS if constant OR operand doesn't fit into bits + // cleared by constant AND operand. 
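The comment above describes the new MipsISelLowering guard (the check itself follows): forming ins (bit-field insert) from (x & mask) | val is only sound when the OR constant's set bits all lie within the field the AND mask clears; any overlap means the original expression keeps bits the insert would clobber. A standalone model of the legality test:

#include <cstdint>
#include <cstdio>

// (x & AndMask) | OrVal can fold to a bit-field insert only when OrVal's
// set bits all lie in the bits the AND clears, i.e. OrVal & AndMask == 0.
static bool canFoldToIns(uint32_t AndMask, uint32_t OrVal) {
  return (OrVal & AndMask) == 0;
}

int main() {
  std::printf("%d\n", canFoldToIns(0xffff0000u, 0x1234u));  // 1: fits field
  std::printf("%d\n", canFoldToIns(0xffff0000u, 0x10000u)); // 0: overlaps
  return 0;
}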
+ if (CN->getSExtValue() & CN1->getSExtValue()) + return SDValue(); + SDLoc DL(N); EVT ValTy = N->getOperand(0)->getValueType(0); SDValue Const1; diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp index 272595af5f6f1..b95f1158fa562 100644 --- a/lib/Target/Mips/MipsLongBranch.cpp +++ b/lib/Target/Mips/MipsLongBranch.cpp @@ -274,8 +274,8 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { if (IsPIC) { MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB); MF->insert(FallThroughMBB, BalTgtMBB); - LongBrMBB->addSuccessor(BalTgtMBB, BranchProbability::getOne()); - BalTgtMBB->addSuccessor(&*FallThroughMBB, BranchProbability::getOne()); + LongBrMBB->addSuccessor(BalTgtMBB); + BalTgtMBB->addSuccessor(TgtMBB); // We must select between the MIPS32r6/MIPS64r6 BAL (which is a normal // instruction) and the pre-MIPS32r6/MIPS64r6 definition (which is an @@ -342,8 +342,8 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { .addReg(Mips::SP).addImm(8); if (Subtarget.hasMips32r6()) - BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR), Mips::ZERO) - .addReg(Mips::AT); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR)) + .addReg(Mips::ZERO).addReg(Mips::AT); else BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR)).addReg(Mips::AT); @@ -415,8 +415,8 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { .addReg(Mips::SP_64).addImm(0); if (Subtarget.hasMips64r6()) - BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR64), Mips::ZERO_64) - .addReg(Mips::AT_64); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR64)) + .addReg(Mips::ZERO_64).addReg(Mips::AT_64); else BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR64)).addReg(Mips::AT_64); diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index 2382ea2716612..b57bceb3c8371 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -1257,19 +1257,22 @@ static SDValue lowerMSACopyIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) { static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) { EVT ResVecTy = Op->getValueType(0); EVT ViaVecTy = ResVecTy; + bool BigEndian = !DAG.getSubtarget().getTargetTriple().isLittleEndian(); SDLoc DL(Op); // When ResVecTy == MVT::v2i64, LaneA is the upper 32 bits of the lane and // LaneB is the lower 32-bits. Otherwise LaneA and LaneB are alternating // lanes. 
- SDValue LaneA; - SDValue LaneB = Op->getOperand(2); + SDValue LaneA = Op->getOperand(OpNr); + SDValue LaneB; if (ResVecTy == MVT::v2i64) { - LaneA = DAG.getConstant(0, DL, MVT::i32); + LaneB = DAG.getConstant(0, DL, MVT::i32); ViaVecTy = MVT::v4i32; + if(BigEndian) + std::swap(LaneA, LaneB); } else - LaneA = LaneB; + LaneB = LaneA; SDValue Ops[16] = { LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB }; @@ -1277,8 +1280,11 @@ static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) { SDValue Result = DAG.getBuildVector( ViaVecTy, DL, makeArrayRef(Ops, ViaVecTy.getVectorNumElements())); - if (ViaVecTy != ResVecTy) - Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy, Result); + if (ViaVecTy != ResVecTy) { + SDValue One = DAG.getConstant(1, DL, ViaVecTy); + Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy, + DAG.getNode(ISD::AND, DL, ViaVecTy, Result, One)); + } return Result; } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 028c2cb562f8e..6d7eb786a6835 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -113,8 +113,9 @@ public: return (IsLittleEndian? InfosLE : InfosBE)[Kind - FirstTargetFixupKind]; } - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override { + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override { Value = adjustFixupValue(Fixup.getKind(), Value); if (!Value) return; // Doesn't change encoding. @@ -130,10 +131,8 @@ public: } } - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override { + void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, bool &IsResolved) override { switch ((PPC::Fixups)Fixup.getKind()) { default: break; case PPC::fixup_ppc_br24: diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h index 38ae62b26757a..07c9c1f9f84c0 100644 --- a/lib/Target/PowerPC/PPC.h +++ b/lib/Target/PowerPC/PPC.h @@ -24,7 +24,6 @@ namespace llvm { class PPCTargetMachine; class PassRegistry; class FunctionPass; - class ImmutablePass; class MachineInstr; class AsmPrinter; class MCInst; diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index 57a1d373c88cf..c2c115cb6dafa 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -521,7 +521,7 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const { const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); bool HasBP = RegInfo->hasBasePointer(MF); unsigned BPReg = HasBP ? (unsigned) RegInfo->getBaseRegister(MF) : FPReg; - unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FPReg; + unsigned BP8Reg = HasBP ? 
(unsigned) PPC::X30 : FP8Reg; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 662550f7a396a..72f14e9691382 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -2560,8 +2560,9 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, PPCII::MO_TPREL_HA); SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_LO); - SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, - is64bit ? MVT::i64 : MVT::i32); + SDValue TLSReg = is64bit ? DAG.getRegister(PPC::X13, MVT::i64) + : DAG.getRegister(PPC::R2, MVT::i32); + SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg); return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi); } @@ -8377,9 +8378,9 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (IntrinsicID == Intrinsic::thread_pointer) { // Reads the thread pointer register, used for __builtin_thread_pointer. - bool is64bit = Subtarget.isPPC64(); - return DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, - is64bit ? MVT::i64 : MVT::i32); + if (Subtarget.isPPC64()) + return DAG.getRegister(PPC::X13, MVT::i64); + return DAG.getRegister(PPC::R2, MVT::i32); } // If this is a lowered altivec predicate compare, CompareOpc is set to the diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 70536a6039b82..e2af5e5295445 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -972,13 +972,15 @@ def LDMX : XForm_1<31, 309, (outs g8rc:$rD), (ins memrr:$src), // Support for medium and large code model. let hasSideEffects = 0 in { +let isReMaterializable = 1 in { def ADDIStocHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), "#ADDIStocHA", []>, isPPC64; +def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), + "#ADDItocL", []>, isPPC64; +} let mayLoad = 1 in def LDtocL: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg), "#LDtocL", []>, isPPC64; -def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), - "#ADDItocL", []>, isPPC64; } // Support for thread-local storage. @@ -994,7 +996,7 @@ def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg), (PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>, isPPC64; -let isBarrier = 1, isPseudo = 1, Defs = [CR7], Itinerary = IIC_LdStSync in +let isPseudo = 1, Defs = [CR7], Itinerary = IIC_LdStSync in def CFENCE8 : Pseudo<(outs), (ins g8rc:$cr), "#CFENCE8", []>; def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g), diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 236e513bec231..13b4f9ab962da 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -292,6 +292,29 @@ unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, return 0; } +// For opcodes with the ReMaterializable flag set, this function is called to +// verify the instruction is really rematerializable. +bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, + AliasAnalysis *AA) const { + switch (MI.getOpcode()) { + default: + // This function should only be called for opcodes with the ReMaterializable + // flag set.
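+ // For instance, PPC::LI materializes an immediate from no register inputs + // and has no side effects, so re-emitting it next to a use is always safe + // and cheaper than spilling its result (illustrative note).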
+ llvm_unreachable("Unknown rematerializable operation!"); + break; + case PPC::LI: + case PPC::LI8: + case PPC::LIS: + case PPC::LIS8: + case PPC::QVGPCI: + case PPC::ADDIStocHA: + case PPC::ADDItocL: + case PPC::LOAD_STACK_GUARD: + return true; + } + return false; +} + unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const { // Note: This list must be kept consistent with StoreRegToStackSlot. diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index 8dd4dbb608794..b0629c88cf57b 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -162,6 +162,8 @@ public: unsigned &SubIdx) const override; unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; + bool isReallyTriviallyReMaterializable(const MachineInstr &MI, + AliasAnalysis *AA) const override; unsigned isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override; diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 637e52bbdbeec..8af7f7e981171 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -389,9 +389,14 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) { - BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg) - .addReg(PPC::R31) - .addImm(FrameSize); + if (LP64) + BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), Reg) + .addReg(PPC::X31) + .addImm(FrameSize); + else + BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg) + .addReg(PPC::R31) + .addImm(FrameSize); } else if (LP64) { BuildMI(MBB, II, dl, TII.get(PPC::LD), Reg) .addImm(0) @@ -478,8 +483,10 @@ void PPCRegisterInfo::lowerDynamicAreaOffset( const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); unsigned maxCallFrameSize = MFI.getMaxCallFrameSize(); + bool is64Bit = TM.isPPC64(); DebugLoc dl = MI.getDebugLoc(); - BuildMI(MBB, II, dl, TII.get(PPC::LI), MI.getOperand(0).getReg()) + BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LI8 : PPC::LI), + MI.getOperand(0).getReg()) .addImm(maxCallFrameSize); MBB.erase(II); } diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index 5a226b23ff96f..a88a6541e8d00 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -86,9 +86,9 @@ EnableMachineCombinerPass("ppc-machine-combiner", extern "C" void LLVMInitializePowerPCTarget() { // Register the targets - RegisterTargetMachine<PPC32TargetMachine> A(getThePPC32Target()); - RegisterTargetMachine<PPC64TargetMachine> B(getThePPC64Target()); - RegisterTargetMachine<PPC64TargetMachine> C(getThePPC64LETarget()); + RegisterTargetMachine<PPCTargetMachine> A(getThePPC32Target()); + RegisterTargetMachine<PPCTargetMachine> B(getThePPC64Target()); + RegisterTargetMachine<PPCTargetMachine> C(getThePPC64LETarget()); PassRegistry &PR = *PassRegistry::getPassRegistry(); initializePPCBoolRetToIntPass(PR); @@ -177,32 +177,34 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT, assert(Options.MCOptions.getABIName().empty() && "Unknown target-abi option!"); - if (!TT.isMacOSX()) { - switch (TT.getArch()) { - case Triple::ppc64le: - return PPCTargetMachine::PPC_ABI_ELFv2; - case Triple::ppc64: - return PPCTargetMachine::PPC_ABI_ELFv1; - default: - // Fallthrough. 
- ; - } + if (TT.isMacOSX()) + return PPCTargetMachine::PPC_ABI_UNKNOWN; + + switch (TT.getArch()) { + case Triple::ppc64le: + return PPCTargetMachine::PPC_ABI_ELFv2; + case Triple::ppc64: + return PPCTargetMachine::PPC_ABI_ELFv1; + default: + return PPCTargetMachine::PPC_ABI_UNKNOWN; } - return PPCTargetMachine::PPC_ABI_UNKNOWN; } static Reloc::Model getEffectiveRelocModel(const Triple &TT, Optional<Reloc::Model> RM) { - if (!RM.hasValue()) { - if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le) { - if (!TT.isOSBinFormatMachO() && !TT.isMacOSX()) - return Reloc::PIC_; - } - if (TT.isOSDarwin()) - return Reloc::DynamicNoPIC; - return Reloc::Static; - } - return *RM; + if (RM.hasValue()) + return *RM; + + // Darwin defaults to dynamic-no-pic. + if (TT.isOSDarwin()) + return Reloc::DynamicNoPIC; + + // Non-darwin 64-bit platforms are PIC by default. + if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le) + return Reloc::PIC_; + + // 32-bit is static by default. + return Reloc::Static; } // The FeatureString here is a little subtle. We are modifying the feature @@ -224,26 +226,6 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT, PPCTargetMachine::~PPCTargetMachine() = default; -void PPC32TargetMachine::anchor() {} - -PPC32TargetMachine::PPC32TargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Optional<Reloc::Model> RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) - : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} - -void PPC64TargetMachine::anchor() {} - -PPC64TargetMachine::PPC64TargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Optional<Reloc::Model> RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) - : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} - const PPCSubtarget * PPCTargetMachine::getSubtargetImpl(const Function &F) const { Attribute CPUAttr = F.getFnAttribute("target-cpu"); @@ -406,7 +388,7 @@ void PPCPassConfig::addPreRegAlloc() { // FIXME: We probably don't need to run these for -fPIE. if (getPPCTargetMachine().isPositionIndependent()) { // FIXME: LiveVariables should not be necessary here! - // PPCTLSDYnamicCallPass uses LiveIntervals which previously dependet on + // PPCTLSDynamicCallPass uses LiveIntervals which previously depended on // LiveVariables. This (unnecessary) dependency has been removed now, // however a stage-2 clang build fails without LiveVariables computed here. addPass(&LiveVariablesID, false); diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index b8f5a2083d808..5eb6ba785d1b8 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -23,7 +23,7 @@ namespace llvm { /// Common code between 32-bit and 64-bit PowerPC targets. /// -class PPCTargetMachine : public LLVMTargetMachine { +class PPCTargetMachine final : public LLVMTargetMachine { public: enum PPCABI { PPC_ABI_UNKNOWN, PPC_ABI_ELFv1, PPC_ABI_ELFv2 }; private: @@ -60,29 +60,6 @@ public: return false; } }; - -/// PowerPC 32-bit target machine. -/// -class PPC32TargetMachine : public PPCTargetMachine { - virtual void anchor(); -public: - PPC32TargetMachine(const Target &T, const Triple &TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, - Optional<Reloc::Model> RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; - -/// PowerPC 64-bit target machine.
-/// -class PPC64TargetMachine : public PPCTargetMachine { - virtual void anchor(); -public: - PPC64TargetMachine(const Target &T, const Triple &TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, - Optional<Reloc::Model> RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; - } // end namespace llvm #endif diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index d9a71893afee7..f85c0cf111c43 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -32,8 +32,9 @@ public: : MCAsmBackend(), OSABI(OSABI), Is64Bit(Is64Bit) {} ~RISCVAsmBackend() override {} - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; @@ -69,9 +70,10 @@ bool RISCVAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { return true; } -void RISCVAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void RISCVAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { return; } diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index d1d1334163a26..c72b47b090857 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -203,10 +203,8 @@ namespace { return InfosBE[Kind - FirstTargetFixupKind]; } - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override { + void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, bool &IsResolved) override { switch ((Sparc::Fixups)Fixup.getKind()) { default: break; case Sparc::fixup_sparc_wplt30: @@ -273,9 +271,9 @@ namespace { ELFSparcAsmBackend(const Target &T, Triple::OSType OSType) : SparcAsmBackend(T), OSType(OSType) { } - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, - MCContext &Ctx) const override { + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override { Value = adjustFixupValue(Fixup.getKind(), Value); if (!Value) return; // Doesn't change encoding. 
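Across the Mips, PowerPC, RISCV, and Sparc hunks above (and the SystemZ, WebAssembly, and X86 hunks below), applyFixup trades a raw char* plus a separate DataSize for MutableArrayRef<char>, so the buffer carries its own length. A minimal sketch of the resulting idiom, built around a hypothetical patchLE32 helper that is not part of this commit:

  #include "llvm/ADT/ArrayRef.h"
  #include <cassert>
  #include <cstdint>

  // Hypothetical helper: OR a 32-bit little-endian value into fixup bytes.
  // Because MutableArrayRef knows its own size, the bounds assert cannot
  // drift out of sync with the buffer the way a separately passed DataSize
  // could.
  static void patchLE32(llvm::MutableArrayRef<char> Data, unsigned Offset,
                        uint32_t Value) {
    assert(Offset + 4 <= Data.size() && "Invalid fixup offset!");
    for (unsigned I = 0; I != 4; ++I)
      Data[Offset + I] |= char((Value >> (8 * I)) & 0xff);
  }

The SystemZ, WebAssembly, and X86 hunks below update their asserts to this same Data.size()-based form.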
diff --git a/lib/Target/Sparc/SparcTargetObjectFile.cpp b/lib/Target/Sparc/SparcTargetObjectFile.cpp index 627e49a95f3cc..2c040dce994b6 100644 --- a/lib/Target/Sparc/SparcTargetObjectFile.cpp +++ b/lib/Target/Sparc/SparcTargetObjectFile.cpp @@ -15,6 +15,12 @@ using namespace llvm; +void SparcELFTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} + const MCExpr *SparcELFTargetObjectFile::getTTypeGlobalReference( const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM, MachineModuleInfo *MMI, MCStreamer &Streamer) const { diff --git a/lib/Target/Sparc/SparcTargetObjectFile.h b/lib/Target/Sparc/SparcTargetObjectFile.h index fe8800625a567..3b1b345c3b193 100644 --- a/lib/Target/Sparc/SparcTargetObjectFile.h +++ b/lib/Target/Sparc/SparcTargetObjectFile.h @@ -23,6 +23,8 @@ public: TargetLoweringObjectFileELF() {} + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + const MCExpr *getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM, diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp index fd1fd7bc40dcc..6b32a7926437a 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp @@ -50,8 +50,9 @@ public: return SystemZ::NumTargetFixupKinds; } const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } @@ -89,15 +90,17 @@ SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { return Infos[Kind - FirstTargetFixupKind]; } -void SystemZMCAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void SystemZMCAsmBackend::applyFixup(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { MCFixupKind Kind = Fixup.getKind(); unsigned Offset = Fixup.getOffset(); unsigned BitSize = getFixupKindInfo(Kind).TargetSize; unsigned Size = (BitSize + 7) / 8; - assert(Offset + Size <= DataSize && "Invalid fixup offset!"); + assert(Offset + Size <= Data.size() && "Invalid fixup offset!"); // Big-endian insertion of Size bytes. Value = extractBitsForFixup(Kind, Value); diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp index a28a91e834f61..0cb2b5a14ce73 100644 --- a/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -277,8 +277,21 @@ void SystemZFrameLowering:: processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const { MachineFrameInfo &MFFrame = MF.getFrameInfo(); - uint64_t MaxReach = (MFFrame.estimateStackSize(MF) + - SystemZMC::CallFrameSize * 2); + // Get the size of our stack frame to be allocated ... + uint64_t StackSize = (MFFrame.estimateStackSize(MF) + + SystemZMC::CallFrameSize); + // ... and the maximum offset we may need to reach into the + // caller's frame to access the save area or stack arguments. 
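+ // Illustrative numbers only, taking SystemZMC::CallFrameSize to be 160: + // an estimated 3950-byte frame gives StackSize 4110, and an argument + // object ending 320 bytes into the caller's frame gives MaxArgOffset 480, + // so MaxReach = 4590 exceeds the 4095-byte displacement limit and the + // scavenging slots below are reserved.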
+ int64_t MaxArgOffset = SystemZMC::CallFrameSize; + for (int I = MFFrame.getObjectIndexBegin(); I != 0; ++I) + if (MFFrame.getObjectOffset(I) >= 0) { + int64_t ArgOffset = SystemZMC::CallFrameSize + + MFFrame.getObjectOffset(I) + + MFFrame.getObjectSize(I); + MaxArgOffset = std::max(MaxArgOffset, ArgOffset); + } + + uint64_t MaxReach = StackSize + MaxArgOffset; if (!isUInt<12>(MaxReach)) { // We may need register scavenging slots if some parts of the frame // are outside the reach of an unsigned 12-bit displacement. diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index ac4c3f6db684d..fef4a8c92a362 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1322,11 +1322,6 @@ SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, RetOps); } -SDValue SystemZTargetLowering::prepareVolatileOrAtomicLoad( - SDValue Chain, const SDLoc &DL, SelectionDAG &DAG) const { - return DAG.getNode(SystemZISD::SERIALIZE, DL, MVT::Other, Chain); -} - // Return true if Op is an intrinsic node with chain that returns the CC value // as its only (other) argument. Provide the associated SystemZISD opcode and // the mask of valid CC values if so. @@ -2059,6 +2054,7 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL, if (NewC.ICmpType != SystemZICMP::SignedOnly && NewC.Op0.getOpcode() == ISD::SHL && isSimpleShift(NewC.Op0, ShiftVal) && + (MaskVal >> ShiftVal != 0) && (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask, MaskVal >> ShiftVal, CmpVal >> ShiftVal, @@ -2068,6 +2064,7 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL, } else if (NewC.ICmpType != SystemZICMP::SignedOnly && NewC.Op0.getOpcode() == ISD::SRL && isSimpleShift(NewC.Op0, ShiftVal) && + (MaskVal << ShiftVal != 0) && (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask, MaskVal << ShiftVal, CmpVal << ShiftVal, @@ -3212,12 +3209,15 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op, return DAG.getNode(SystemZISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0)); } -// Op is an atomic load. Lower it into a normal volatile load. +// Op is an atomic load. Lower it into a serialization followed +// by a normal volatile load. SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const { auto *Node = cast<AtomicSDNode>(Op.getNode()); + SDValue Chain = SDValue(DAG.getMachineNode(SystemZ::Serialize, SDLoc(Op), + MVT::Other, Node->getChain()), 0); return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), Op.getValueType(), - Node->getChain(), Node->getBasePtr(), + Chain, Node->getBasePtr(), Node->getMemoryVT(), Node->getMemOperand()); } @@ -4688,7 +4688,6 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(STRCMP); OPCODE(SEARCH_STRING); OPCODE(IPM); - OPCODE(SERIALIZE); OPCODE(MEMBARRIER); OPCODE(TBEGIN); OPCODE(TBEGIN_NOFLOAT); diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 79c8c4d92669f..5dcb19c0a35db 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -139,9 +139,6 @@ enum NodeType : unsigned { // Store the CC value in bits 29 and 28 of an integer. IPM, - // Perform a serialization operation. (BCR 15,0 or BCR 14,0.) - SERIALIZE, - // Compiler barrier only; generate a no-op. 
MEMBARRIER, @@ -471,8 +468,6 @@ public: const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; - SDValue prepareVolatileOrAtomicLoad(SDValue Chain, const SDLoc &DL, - SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; ISD::NodeType getExtendForAtomicOps() const override { @@ -522,7 +517,6 @@ private: unsigned Opcode) const; SDValue lowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const; SDValue lowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerLOAD_SEQUENCE_POINT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index fa5ecdd852433..9f5e6288348e0 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -189,18 +189,15 @@ let isBranch = 1, isTerminator = 1 in { //===----------------------------------------------------------------------===// // Unconditional trap. -// FIXME: This trap instruction should be marked as isTerminator, but there is -// currently a general bug that allows non-terminators to be placed between -// terminators. Temporarily leave this unmarked until the bug is fixed. -let isBarrier = 1, hasCtrlDep = 1 in +let hasCtrlDep = 1 in def Trap : Alias<4, (outs), (ins), [(trap)]>; // Conditional trap. -let isTerminator = 1, hasCtrlDep = 1, Uses = [CC] in +let hasCtrlDep = 1, Uses = [CC] in def CondTrap : Alias<4, (outs), (ins cond4:$valid, cond4:$R1), []>; // Fused compare-and-trap instructions. -let isTerminator = 1, hasCtrlDep = 1 in { +let hasCtrlDep = 1 in { // These patterns work the same way as for compare-and-branch. defm CRT : CmpBranchRRFcPair<"crt", 0xB972, GR32>; defm CGRT : CmpBranchRRFcPair<"cgrt", 0xB960, GR64>; @@ -1449,7 +1446,7 @@ let Predicates = [FeatureExecutionHint] in { // A serialization instruction that acts as a barrier for all memory // accesses, which expands to "bcr 14, 0". let hasSideEffects = 1 in -def Serialize : Alias<2, (outs), (ins), [(z_serialize)]>; +def Serialize : Alias<2, (outs), (ins), []>; // A pseudo instruction that serves as a compiler barrier. let hasSideEffects = 1, hasNoSchedulingInfo = 1 in diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/lib/Target/SystemZ/SystemZMachineScheduler.cpp index ab6020f3f1896..b6feaa49d8585 100644 --- a/lib/Target/SystemZ/SystemZMachineScheduler.cpp +++ b/lib/Target/SystemZ/SystemZMachineScheduler.cpp @@ -23,7 +23,7 @@ using namespace llvm; #ifndef NDEBUG // Print the set of SUs void SystemZPostRASchedStrategy::SUSet:: -dump(SystemZHazardRecognizer &HazardRec) { +dump(SystemZHazardRecognizer &HazardRec) const { dbgs() << "{"; for (auto &SU : *this) { HazardRec.dumpSU(SU, dbgs()); diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.h b/lib/Target/SystemZ/SystemZMachineScheduler.h index 12357e0348a9e..3dfef388691e7 100644 --- a/lib/Target/SystemZ/SystemZMachineScheduler.h +++ b/lib/Target/SystemZ/SystemZMachineScheduler.h @@ -72,7 +72,7 @@ class SystemZPostRASchedStrategy : public MachineSchedStrategy { // A set of SUs with a sorter and dump method. 
struct SUSet : std::set<SUnit*, SUSorter> { #ifndef NDEBUG - void dump(SystemZHazardRecognizer &HazardRec); + void dump(SystemZHazardRecognizer &HazardRec) const; #endif }; diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index adfc69c5d4cf4..ab2392809f3be 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -191,8 +191,6 @@ def z_sdivrem64 : SDNode<"SystemZISD::SDIVREM64", SDT_ZGR128Binary64>; def z_udivrem32 : SDNode<"SystemZISD::UDIVREM32", SDT_ZGR128Binary32>; def z_udivrem64 : SDNode<"SystemZISD::UDIVREM64", SDT_ZGR128Binary64>; -def z_serialize : SDNode<"SystemZISD::SERIALIZE", SDTNone, - [SDNPHasChain, SDNPMayStore]>; def z_membarrier : SDNode<"SystemZISD::MEMBARRIER", SDTNone, [SDNPHasChain, SDNPSideEffect]>; diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h index eb2f17a2091c3..a10ca64fa6329 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.h +++ b/lib/Target/SystemZ/SystemZTargetMachine.h @@ -51,8 +51,6 @@ public: } bool targetSchedulesPostRAScheduling() const override { return true; }; - - bool isMachineVerifierClean() const override { return false; } }; } // end namespace llvm diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp index 4f20096c15830..1357cb5735f8a 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp @@ -37,8 +37,9 @@ public: : MCAsmBackend(), Is64Bit(Is64Bit) {} ~WebAssemblyAsmBackendELF() override {} - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; @@ -77,8 +78,9 @@ public: const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; @@ -105,9 +107,11 @@ bool WebAssemblyAsmBackendELF::writeNopData(uint64_t Count, return true; } -void WebAssemblyAsmBackendELF::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void WebAssemblyAsmBackendELF::applyFixup(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const { const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind()); assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags"); @@ -119,7 +123,7 @@ void WebAssemblyAsmBackendELF::applyFixup(const MCFixup &Fixup, char *Data, Value <<= Info.TargetOffset; unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. 
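// For example (illustrative values): a 4-byte fixup with Value 0x12345678 // at Offset 2 ORs the bytes 0x78, 0x56, 0x34, 0x12 into Data[2..5] and // leaves every other byte of the fragment untouched.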
@@ -163,9 +167,11 @@ bool WebAssemblyAsmBackend::writeNopData(uint64_t Count, return true; } -void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void WebAssemblyAsmBackend::applyFixup(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const { const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind()); assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags"); @@ -177,7 +183,7 @@ void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, Value <<= Info.TargetOffset; unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp index c56c591def361..3e3b52fca5691 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp @@ -36,7 +36,6 @@ STATISTIC(MCNumFixups, "Number of MC fixups created."); namespace { class WebAssemblyMCCodeEmitter final : public MCCodeEmitter { const MCInstrInfo &MCII; - MCContext &Ctx; // Implementation generated by tablegen. uint64_t getBinaryCodeForInstr(const MCInst &MI, @@ -48,14 +47,12 @@ class WebAssemblyMCCodeEmitter final : public MCCodeEmitter { const MCSubtargetInfo &STI) const override; public: - WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) - : MCII(mcii), Ctx(ctx) {} + WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {} }; } // end anonymous namespace -MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII, - MCContext &Ctx) { - return new WebAssemblyMCCodeEmitter(MCII, Ctx); +MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII) { + return new WebAssemblyMCCodeEmitter(MCII); } void WebAssemblyMCCodeEmitter::encodeInstruction( @@ -89,11 +86,7 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( } else if (Info.OperandType == WebAssembly::OPERAND_I64IMM) { encodeSLEB128(int64_t(MO.getImm()), OS); } else if (Info.OperandType == WebAssembly::OPERAND_GLOBAL) { - Fixups.push_back(MCFixup::create( - OS.tell() - Start, MCConstantExpr::create(MO.getImm(), Ctx), - MCFixupKind(WebAssembly::fixup_code_global_index), MI.getLoc())); - ++MCNumFixups; - encodeULEB128(uint64_t(MO.getImm()), OS); + llvm_unreachable("wasm globals should only be accessed symbolically"); } else if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) { encodeSLEB128(int64_t(MO.getImm()), OS); } else { @@ -135,6 +128,9 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) { FixupKind = MCFixupKind(WebAssembly::fixup_code_uleb128_i32); PaddedSize = 5; + } else if (Info.OperandType == WebAssembly::OPERAND_GLOBAL) { + FixupKind = MCFixupKind(WebAssembly::fixup_code_global_index); + PaddedSize = 5; } else { llvm_unreachable("unexpected symbolic operand kind"); } diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index 9fd3ec81c258f..9580eeaa33d73 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++
b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -74,7 +74,7 @@ static MCInstPrinter *createMCInstPrinter(const Triple & /*T*/, static MCCodeEmitter *createCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo & /*MRI*/, MCContext &Ctx) { - return createWebAssemblyMCCodeEmitter(MCII, Ctx); + return createWebAssemblyMCCodeEmitter(MCII); } static MCAsmBackend *createAsmBackend(const Target & /*T*/, diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 0ba700a86b744..4d676c32a09c5 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -35,8 +35,7 @@ class raw_pwrite_stream; Target &getTheWebAssemblyTarget32(); Target &getTheWebAssemblyTarget64(); -MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII, - MCContext &Ctx); +MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII); MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT); diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp index 19e14f3261aa7..9cf77829f3bc2 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp @@ -68,6 +68,8 @@ WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target, bool IsFunction = IsFunctionExpr(Fixup.getValue()); switch (unsigned(Fixup.getKind())) { + case WebAssembly::fixup_code_global_index: + return wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB; case WebAssembly::fixup_code_sleb128_i32: if (IsFunction) return wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB; diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index b999091e2d294..f51585a10ca12 100644 --- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -96,13 +96,6 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) { MCConstantExpr::create(Size, OutContext)); } } - - if (!TM.getTargetTriple().isOSBinFormatELF()) { - MachineModuleInfoWasm &MMIW = MMI->getObjFileInfo<MachineModuleInfoWasm>(); - getTargetStreamer()->emitGlobal(MMIW.getGlobals()); - if (MMIW.hasStackPointerGlobal()) - getTargetStreamer()->emitStackPointer(MMIW.getStackPointerGlobal()); - } } void WebAssemblyAsmPrinter::EmitConstantPool() { diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 09338a4898e03..c980f4b87f916 100644 --- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -63,12 +63,16 @@ class WebAssemblyFastISel final : public FastISel { public: // Innocuous defaults for our address. 
Address() : Kind(RegBase), Offset(0), GV(0) { Base.Reg = 0; } - void setKind(BaseKind K) { Kind = K; } + void setKind(BaseKind K) { + assert(!isSet() && "Can't change kind with non-zero base"); + Kind = K; + } BaseKind getKind() const { return Kind; } bool isRegBase() const { return Kind == RegBase; } bool isFIBase() const { return Kind == FrameIndexBase; } void setReg(unsigned Reg) { assert(isRegBase() && "Invalid base register access!"); + assert(Base.Reg == 0 && "Overwriting non-zero register"); Base.Reg = Reg; } unsigned getReg() const { @@ -77,6 +81,7 @@ class WebAssemblyFastISel final : public FastISel { } void setFI(unsigned FI) { assert(isFIBase() && "Invalid base frame index access!"); + assert(Base.FI == 0 && "Overwriting non-zero frame index"); Base.FI = FI; } unsigned getFI() const { @@ -91,6 +96,13 @@ class WebAssemblyFastISel final : public FastISel { int64_t getOffset() const { return Offset; } void setGlobalValue(const GlobalValue *G) { GV = G; } const GlobalValue *getGlobalValue() const { return GV; } + bool isSet() const { + if (isRegBase()) { + return Base.Reg != 0; + } else { + return Base.FI != 0; + } + } }; /// Keep a pointer to the WebAssemblySubtarget around so that we can make the @@ -297,6 +309,9 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { DenseMap<const AllocaInst *, int>::iterator SI = FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { + if (Addr.isSet()) { + return false; + } Addr.setKind(Address::FrameIndexBase); Addr.setFI(SI->second); return true; @@ -341,6 +356,9 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { break; } } + if (Addr.isSet()) { + return false; + } Addr.setReg(getRegForValue(Obj)); return Addr.getReg() != 0; } diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp index 4209bc333f230..a37d6136e44ed 100644 --- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp @@ -104,10 +104,10 @@ static void writeSPToMemory(unsigned SrcReg, MachineFunction &MF, const DebugLoc &DL) { const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); + const char *ES = "__stack_pointer"; + auto *SPSymbol = MF.createExternalSymbolName(ES); if (MF.getSubtarget<WebAssemblySubtarget>() .getTargetTriple().isOSBinFormatELF()) { - const char *ES = "__stack_pointer"; - auto *SPSymbol = MF.createExternalSymbolName(ES); MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterClass *PtrRC = MRI.getTargetRegisterInfo()->getPointerRegClass(MF); @@ -125,10 +125,8 @@ static void writeSPToMemory(unsigned SrcReg, MachineFunction &MF, .addReg(SrcReg) .addMemOperand(MMO); } else { - MachineModuleInfoWasm &MMIW = - MF.getMMI().getObjFileInfo<MachineModuleInfoWasm>(); BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::SET_GLOBAL_I32)) - .addImm(MMIW.getStackPointerGlobal()) + .addExternalSymbol(SPSymbol) .addReg(SrcReg); } } @@ -171,10 +169,11 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF, unsigned SPReg = WebAssembly::SP32; if (StackSize) SPReg = MRI.createVirtualRegister(PtrRC); + + const char *ES = "__stack_pointer"; + auto *SPSymbol = MF.createExternalSymbolName(ES); if (MF.getSubtarget<WebAssemblySubtarget>() .getTargetTriple().isOSBinFormatELF()) { - const char *ES = "__stack_pointer"; - auto *SPSymbol = MF.createExternalSymbolName(ES); unsigned Zero = MRI.createVirtualRegister(PtrRC); BuildMI(MBB, InsertPt, DL, 
TII->get(WebAssembly::CONST_I32), Zero) @@ -189,22 +188,8 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF, .addReg(Zero) // addr .addMemOperand(LoadMMO); } else { - auto &MMIW = MF.getMMI().getObjFileInfo<MachineModuleInfoWasm>(); - if (!MMIW.hasStackPointerGlobal()) { - MMIW.setStackPointerGlobal(MMIW.getGlobals().size()); - - // Create the stack-pointer global. For now, just use the - // Emscripten/Binaryen ABI names. - wasm::Global G; - G.Type = wasm::ValType::I32; - G.Mutable = true; - G.InitialValue = 0; - G.InitialModule = "env"; - G.InitialName = "STACKTOP"; - MMIW.addGlobal(G); - } BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::GET_GLOBAL_I32), SPReg) - .addImm(MMIW.getStackPointerGlobal()); + .addExternalSymbol(SPSymbol); } bool HasBP = hasBP(MF); diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index 6650191807dcb..ea9e3fa862ce2 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -170,28 +170,16 @@ static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read, if (MI.mayStore()) { Write = true; - const MachineFunction &MF = *MI.getParent()->getParent(); - if (MF.getSubtarget<WebAssemblySubtarget>() - .getTargetTriple().isOSBinFormatELF()) { - // Check for stores to __stack_pointer. - for (auto MMO : MI.memoperands()) { - const MachinePointerInfo &MPI = MMO->getPointerInfo(); - if (MPI.V.is<const PseudoSourceValue *>()) { - auto PSV = MPI.V.get<const PseudoSourceValue *>(); - if (const ExternalSymbolPseudoSourceValue *EPSV = - dyn_cast<ExternalSymbolPseudoSourceValue>(PSV)) - if (StringRef(EPSV->getSymbol()) == "__stack_pointer") - StackPointer = true; - } - } - } else { - // Check for sets of the stack pointer. - const MachineModuleInfoWasm &MMIW = - MF.getMMI().getObjFileInfo<MachineModuleInfoWasm>(); - if ((MI.getOpcode() == WebAssembly::SET_LOCAL_I32 || - MI.getOpcode() == WebAssembly::SET_LOCAL_I64) && - MI.getOperand(0).getImm() == MMIW.getStackPointerGlobal()) { - StackPointer = true; + // Check for stores to __stack_pointer. 
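+ // (For example, the __stack_pointer store emitted by writeSPToMemory in + // WebAssemblyFrameLowering; the test is now purely symbol-based instead of + // being conditional on the object format.)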
+ for (auto MMO : MI.memoperands()) { + const MachinePointerInfo &MPI = MMO->getPointerInfo(); + if (MPI.V.is<const PseudoSourceValue *>()) { + auto PSV = MPI.V.get<const PseudoSourceValue *>(); + if (const ExternalSymbolPseudoSourceValue *EPSV = + dyn_cast<ExternalSymbolPseudoSourceValue>(PSV)) + if (StringRef(EPSV->getSymbol()) == "__stack_pointer") { + StackPointer = true; + } } } } else if (MI.hasOrderedMemoryRef()) { diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index e5d3209ec6a97..d30cc724c203f 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1705,8 +1705,7 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) { SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data()); unsigned Len = DotDispStr.size(); - unsigned Val = OrigDispVal + DotDispVal; - InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, Val); + InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, DotDispVal); } NewDisp = MCConstantExpr::create(OrigDispVal + DotDispVal, getContext()); diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 7a9e4f4468ec7..914fb36f91a7d 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -108,12 +108,12 @@ public: return Infos[Kind - FirstTargetFixupKind]; } - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override { + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override { unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind()); - assert(Fixup.getOffset() + Size <= DataSize && - "Invalid fixup offset!"); + assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!"); // Check that upper bits are either all zeros or all ones. // Specifically ignore overflow/underflow as long as the leakage is diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 4097ef224d503..caf98bffb80de 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -153,8 +153,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( const MCSymbol *B_Base = Asm.getAtom(*B); // Neither symbol can be modified.
- if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || - Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) { + if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None) { Asm.getContext().reportError(Fixup.getLoc(), "unsupported relocation of modified symbol"); return; diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 105580c913a16..5892f1de33eec 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -10,6 +10,7 @@ #include "MCTargetDesc/X86FixupKinds.h" #include "MCTargetDesc/X86MCTargetDesc.h" #include "llvm/BinaryFormat/COFF.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixup.h" #include "llvm/MC/MCValue.h" @@ -25,8 +26,8 @@ public: X86WinCOFFObjectWriter(bool Is64Bit); ~X86WinCOFFObjectWriter() override = default; - unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsCrossSection, + unsigned getRelocType(MCContext &Ctx, const MCValue &Target, + const MCFixup &Fixup, bool IsCrossSection, const MCAsmBackend &MAB) const override; }; @@ -36,11 +37,19 @@ X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit) : MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64 : COFF::IMAGE_FILE_MACHINE_I386) {} -unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target, +unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx, + const MCValue &Target, const MCFixup &Fixup, bool IsCrossSection, const MCAsmBackend &MAB) const { - unsigned FixupKind = IsCrossSection ? FK_PCRel_4 : Fixup.getKind(); + unsigned FixupKind = Fixup.getKind(); + if (IsCrossSection) { + if (FixupKind != FK_Data_4) { + Ctx.reportError(Fixup.getLoc(), "Cannot represent this expression"); + return COFF::IMAGE_REL_AMD64_ADDR32; + } + FixupKind = FK_PCRel_4; + } MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ? MCSymbolRefExpr::VK_None : Target.getSymA()->getKind(); diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 2777fa89330f6..e3aa227702bea 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -748,17 +748,7 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, else CallOp = X86::CALLpcrel32; - const char *Symbol; - if (Is64Bit) { - if (STI.isTargetCygMing()) { - Symbol = "___chkstk_ms"; - } else { - Symbol = "__chkstk"; - } - } else if (STI.isTargetCygMing()) - Symbol = "_alloca"; - else - Symbol = "_chkstk"; + StringRef Symbol = STI.getTargetLowering()->getStackProbeSymbolName(MF); MachineInstrBuilder CI; MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI); @@ -769,10 +759,11 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, // For the large code model, we have to call through a register. Use R11, // as it is scratch in all supported calling conventions. BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11) - .addExternalSymbol(Symbol); + .addExternalSymbol(MF.createExternalSymbolName(Symbol)); CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11); } else { - CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol); + CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)) + .addExternalSymbol(MF.createExternalSymbolName(Symbol)); } unsigned AX = Is64Bit ? 
X86::RAX : X86::EAX; @@ -783,13 +774,16 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, .addReg(SP, RegState::Define | RegState::Implicit) .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - if (Is64Bit) { + if (STI.isTargetWin64() || !STI.isOSWindows()) { + // MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves. // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp - themselves. It also does not clobber %rax so we can reuse it when + themselves. They also do not clobber %rax so we can reuse it when adjusting %rsp. - BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP) - .addReg(X86::RSP) - .addReg(X86::RAX); + // All other platforms do not specify a particular ABI for the stack probe + // function, so we arbitrarily define it to not adjust %esp/%rsp itself. + BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Is64Bit)), SP) + .addReg(SP) + .addReg(AX); } if (InProlog) { @@ -978,7 +972,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, X86FI->setCalleeSavedFrameSize( X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); - bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO()); + bool UseRedZone = false; + bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty(); // The default stack probe size is 4096 if the function has no stackprobesize // attribute. @@ -1007,6 +1002,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, !TRI->needsStackRealignment(MF) && !MFI.hasVarSizedObjects() && // No dynamic alloca. !MFI.adjustsStack() && // No calls. + !UseStackProbe && // No stack probes. !IsWin64CC && // Win64 has no Red Zone !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop. !MF.shouldSplitStack()) { // Regular stack @@ -1015,6 +1011,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0); StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); MFI.setStackSize(StackSize); + UseRedZone = true; } // Insert stack pointer adjustment for later moving of return addr. Only @@ -1192,6 +1189,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign); if (AlignedNumBytes >= StackProbeSize && UseStackProbe) { + assert(!UseRedZone && "The Red Zone is not accounted for in stack probes"); + // (With the red zone in use the frame is shrunk by up to 128 bytes and + // data may live below %rsp, which the probing sequence does not account + // for, so the two must never combine.) + + // Check whether EAX is livein for this block.
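// (Illustrative note: the probe sequence passes the allocation size in // EAX/RAX, so a live-in value has to be saved and restored around it.)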
bool isEAXAlive = isEAXLiveIn(MBB); diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 2a1633de0a239..3c4589ab18f6f 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -204,6 +204,11 @@ namespace { bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); + template <class GatherScatterSDNode> + bool selectAddrOfGatherScatterNode(GatherScatterSDNode *Parent, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment); bool selectMOV64Imm32(SDValue N, SDValue &Imm); bool selectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, @@ -1415,13 +1420,10 @@ bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { return false; } -bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, - SDValue &Scale, SDValue &Index, - SDValue &Disp, SDValue &Segment) { - - MaskedGatherScatterSDNode *Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent); - if (!Mgs) - return false; +template <class GatherScatterSDNode> +bool X86DAGToDAGISel::selectAddrOfGatherScatterNode( + GatherScatterSDNode *Mgs, SDValue N, SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; unsigned AddrSpace = Mgs->getPointerInfo().getAddrSpace(); // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS. @@ -1453,6 +1455,18 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, return true; } +bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + if (auto Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent)) + return selectAddrOfGatherScatterNode<MaskedGatherScatterSDNode>( + Mgs, N, Base, Scale, Index, Disp, Segment); + if (auto X86Gather = dyn_cast<X86MaskedGatherSDNode>(Parent)) + return selectAddrOfGatherScatterNode<X86MaskedGatherSDNode>( + X86Gather, N, Base, Scale, Index, Disp, Segment); + return false; +} + /// Returns true if it is able to pattern match an addressing mode. /// It returns the operands which make up the maximal addressing mode it can /// match by reference. diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 172eba0002d4f..f777e56289884 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1662,6 +1662,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; + + // TODO: These control memcmp expansion in CGP and are set low to prevent + // altering the vector expansion for 16/32 byte memcmp in SelectionDAGBuilder. + MaxLoadsPerMemcmp = 1; + MaxLoadsPerMemcmpOptSize = 1; + // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). setPrefLoopAlignment(ExperimentalPrefLoopAlignment); @@ -14272,9 +14278,8 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // If we are inserting an element, see if we can do this more efficiently with // a blend shuffle with a rematerializable vector than a costly integer // insertion. - // TODO: pre-SSE41 targets will tend to use bit masking - this could still - // be beneficial if we are inserting several zeros and can combine the masks.
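// For example (illustrative indices): inserting zero into lane 2 of a // v4i32 produces BlendMask <0, 1, 6, 3>, taking lane 2 from an all-zeros // vector.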
- if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) { + if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && + 16 <= EltVT.getSizeInBits()) { SmallVector<int, 8> BlendMask; for (unsigned i = 0; i != NumElts; ++i) BlendMask.push_back(i == IdxVal ? i + NumElts : i); @@ -17621,23 +17626,21 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; - SDValue CmpOp0 = Cmp.getOperand(0); + // Apply further optimizations for special cases // (select (x != 0), -1, 0) -> neg & sbb // (select (x == 0), 0, -1) -> neg & sbb if (isNullConstant(Y) && - (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { - SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); - SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, - DAG.getConstant(0, DL, - CmpOp0.getValueType()), - CmpOp0); - SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), - DAG.getConstant(X86::COND_B, DL, MVT::i8), - SDValue(Neg.getNode(), 1)); - return Res; - } + (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { + SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); + SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType()); + SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0); + SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, DL, MVT::i8), + SDValue(Neg.getNode(), 1)); + return Res; + } Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); @@ -18648,8 +18651,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); + bool EmitStackProbe = !getStackProbeSymbolName(MF).empty(); bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) || - SplitStack; + SplitStack || EmitStackProbe; SDLoc dl(Op); // Get the inputs. @@ -23705,6 +23709,57 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SDValue RetOps[] = {Exract, NewGather.getValue(1)}; return DAG.getMergeValues(RetOps, dl); } + if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) { + // There is a special case when the return type v2i32 is illegal and + // the type legalizer extended it to v2i64. Without this conversion we end up + // with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD. + // In order to avoid this situation, we'll build an X86 specific Gather node + // with index v2i64 and value type v4i32. + assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 && + "Unexpected type in masked gather"); + Src0 = DAG.getVectorShuffle(MVT::v4i32, dl, + DAG.getBitcast(MVT::v4i32, Src0), + DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 }); + // The mask should match the destination type. Extending the mask with + // zeroes is not necessary since the instruction itself reads only two + // values from memory.
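+ // For example (illustrative lanes): a v2i1 mask <m0, m1> becomes the v4i1 + // mask <m0, m1, undef, undef>; the upper lanes are don't-care because only + // two elements are loaded.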
+ Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(),
+ N->getMemOperand());
+
+ SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
+ NewGather.getValue(0), DAG);
+ SDValue RetOps[] = { Sext, NewGather.getValue(1) };
+ return DAG.getMergeValues(RetOps, dl);
+ }
+ if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) {
+ // This transformation is for optimization only.
+ // The type legalizer extended the mask and index to 4-element vectors
+ // in order to match the requirements of the common gather node - the
+ // same vector width for index and value. The X86 gather node allows a
+ // vector-width mismatch in order to select a more optimal instruction
+ // in the end.
+ assert(VT == MVT::v4f32 && Src0.getValueType() == MVT::v4f32 &&
+ "Unexpected type in masked gather");
+ if (Mask.getOpcode() == ISD::CONCAT_VECTORS &&
+ ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) &&
+ Index.getOpcode() == ISD::CONCAT_VECTORS &&
+ Index.getOperand(1).isUndef()) {
+ Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false);
+ Index = Index.getOperand(0);
+ } else
+ return Op;
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ DAG.getVTList(MVT::v4f32, MVT::Other), Ops, dl, N->getMemoryVT(),
+ N->getMemOperand());
+
+ SDValue RetOps[] = { NewGather.getValue(0), NewGather.getValue(1) };
+ return DAG.getMergeValues(RetOps, dl);
+
+ }
 return Op;
}
@@ -24508,6 +24563,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
 case X86ISD::LWPINS: return "X86ISD::LWPINS";
+ case X86ISD::MGATHER: return "X86ISD::MGATHER";
 }
 return nullptr;
}
@@ -29868,7 +29924,7 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
 if (N->getValueType(0) == MVT::i32)
 Diff = (unsigned)Diff;
- bool isFastMultiplier = false;
+ bool IsFastMultiplier = false;
 if (Diff < 10) {
 switch ((unsigned char)Diff) {
 default:
@@ -29880,12 +29936,12 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
 case 5: // result = lea base(cond, cond*4)
 case 8: // result = lea base( , cond*8)
 case 9: // result = lea base(cond, cond*8)
- isFastMultiplier = true;
+ IsFastMultiplier = true;
 break;
 }
 }
- if (isFastMultiplier) {
+ if (IsFastMultiplier) {
 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
 if (NeedsCondInvert) // Invert the condition if needed.
 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
@@ -34841,23 +34897,56 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
 !Cmp.getOperand(0).getValueType().isInteger())
 return SDValue();
- // (cmp Z, 1) sets the carry flag if Z is 0.
 SDValue Z = Cmp.getOperand(0);
- SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z,
- DAG.getConstant(1, DL, Z.getValueType()));
+ EVT ZVT = Z.getValueType();
+
+ // If X is -1 or 0, then we have an opportunity to avoid constants required
+ // in the general case below.
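The carry-flag identities that the "fake operands" comments here describe can be modeled directly in portable code. A small sketch, using plain unsigned arithmetic to stand in for the NEG/CMP/SBB flag behavior:

#include <cassert>
#include <cstdint>

// Model of the x86 idiom used above: NEG sets CF iff the operand is nonzero,
// and SBB of a register with itself yields 0 - CF, i.e. 0 or -1.
static uint32_t negSbbAllOnesIfNonZero(uint32_t Z) {
  uint32_t Carry = (Z != 0); // CF after "neg Z" (borrow out of 0 - Z)
  return 0u - Carry;         // "sbb %eax, %eax" -> 0 or 0xFFFFFFFF
}

int main() {
  // 0 - (Z != 0): all-ones when Z != 0, zero when Z == 0.
  assert(negSbbAllOnesIfNonZero(0) == 0u);
  assert(negSbbAllOnesIfNonZero(42) == 0xFFFFFFFFu);

  // The complementary form uses "cmp Z, 1", whose borrow (CF) is set iff Z == 0.
  const uint32_t Zs[] = {0u, 1u, 7u};
  for (uint32_t Z : Zs) {
    uint32_t CarryCmp = (Z < 1); // CF after "cmp Z, 1"
    assert((0u - CarryCmp) == (Z == 0 ? 0xFFFFFFFFu : 0u));
  }
  return 0;
}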
+ if (auto *ConstantX = dyn_cast<ConstantSDNode>(X)) {
+ // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
+ // fake operands:
+ // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
+ // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
+ if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
+ (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
+ SDValue Zero = DAG.getConstant(0, DL, ZVT);
+ SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ SDValue(Neg.getNode(), 1));
+ }
+
+ // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
+ // with fake operands:
+ // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
+ // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
+ if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
+ (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
+ SDValue One = DAG.getConstant(1, DL, ZVT);
+ SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
+ }
+ }
- SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
+ // (cmp Z, 1) sets the carry flag if Z is 0.
+ SDValue One = DAG.getConstant(1, DL, ZVT);
+ SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
+
+ // Add the flags type for ADC/SBB nodes.
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
 if (CC == X86::COND_NE)
 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
- DAG.getConstant(-1ULL, DL, VT), NewCmp);
+ DAG.getConstant(-1ULL, DL, VT), Cmp1);
 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
- DAG.getConstant(0, DL, VT), NewCmp);
+ DAG.getConstant(0, DL, VT), Cmp1);
}
static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
@@ -34976,6 +35065,32 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
}
+/// Convert vector increment or decrement to sub/add with an all-ones constant:
+/// add X, <1, 1...> --> sub X, <-1, -1...>
+/// sub X, <1, 1...> --> add X, <-1, -1...>
+/// The all-ones vector constant can be materialized using a pcmpeq instruction
+/// that is commonly recognized as an idiom (has no register dependency), so
+/// that's better/smaller than loading a splat 1 constant.
+static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
+ assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+ "Unexpected opcode for increment/decrement transform");
+
+ // Pseudo-legality check: getOnesVector() expects one of these types, so bail
+ // out and wait for legalization if we have an unsupported vector length.
+ EVT VT = N->getValueType(0);
+ if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
+ return SDValue();
+
+ SDNode *N1 = N->getOperand(1).getNode();
+ APInt SplatVal;
+ if (!ISD::isConstantSplatVector(N1, SplatVal) || !SplatVal.isOneValue())
+ return SDValue();
+
+ SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
+ unsigned NewOpcode = N->getOpcode() == ISD::ADD ?
ISD::SUB : ISD::ADD; + return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec); +} + static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { const SDNodeFlags Flags = N->getFlags(); @@ -34995,6 +35110,9 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, isHorizontalBinOp(Op0, Op1, true)) return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); + if (SDValue V = combineIncDecVector(N, DAG)) + return V; + return combineAddOrSubToADCOrSBB(N, DAG); } @@ -35028,6 +35146,9 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, isHorizontalBinOp(Op0, Op1, false)) return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); + if (SDValue V = combineIncDecVector(N, DAG)) + return V; + return combineAddOrSubToADCOrSBB(N, DAG); } @@ -36335,3 +36456,22 @@ void X86TargetLowering::insertCopiesSplitCSR( bool X86TargetLowering::supportSwiftError() const { return Subtarget.is64Bit(); } + +/// Returns the name of the symbol used to emit stack probes or the empty +/// string if not applicable. +StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const { + // If the function specifically requests stack probes, emit them. + if (MF.getFunction()->hasFnAttribute("probe-stack")) + return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString(); + + // Generally, if we aren't on Windows, the platform ABI does not include + // support for stack probes, so don't emit them. + if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO()) + return ""; + + // We need a stack probe to conform to the Windows ABI. Choose the right + // symbol. + if (Subtarget.is64Bit()) + return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk"; + return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk"; +} diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index f51b6641db2fb..e1ade92979dc0 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -615,7 +615,10 @@ namespace llvm { // Vector truncating store with unsigned/signed saturation VTRUNCSTOREUS, VTRUNCSTORES, // Vector truncating masked store with unsigned/signed saturation - VMTRUNCSTOREUS, VMTRUNCSTORES + VMTRUNCSTOREUS, VMTRUNCSTORES, + + // X86 specific gather + MGATHER // WARNING: Do not add anything in the end unless you want the node to // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all @@ -1056,6 +1059,8 @@ namespace llvm { bool supportSwiftError() const override; + StringRef getStackProbeSymbolName(MachineFunction &MF) const override; + unsigned getMaxSupportedInterleaveFactor() const override { return 4; } /// \brief Lower interleaved load(s) into target specific @@ -1065,6 +1070,12 @@ namespace llvm { ArrayRef<unsigned> Indices, unsigned Factor) const override; + /// \brief Lower interleaved store(s) into target specific + /// instructions/intrinsics. + bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + unsigned Factor) const override; + + void finalizeLowering(MachineFunction &MF) const override; protected: @@ -1397,6 +1408,19 @@ namespace llvm { } }; + // X86 specific Gather node. 
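A minimal sketch of the LLVM-style RTTI convention the new node below follows: each node kind stores an opcode, and a static classof is what lets dyn_cast/isa recognize it. The types here are illustrative stand-ins, not LLVM's:

#include <cassert>

struct Node {
  unsigned Opcode;
  explicit Node(unsigned Op) : Opcode(Op) {}
};

enum { GenericGatherOp = 1, X86GatherOp = 2 };

struct X86GatherNode : Node {
  X86GatherNode() : Node(X86GatherOp) {}
  // dyn_cast<X86GatherNode>(N) succeeds exactly when this returns true,
  // mirroring X86MaskedGatherSDNode::classof checking X86ISD::MGATHER.
  static bool classof(const Node *N) { return N->Opcode == X86GatherOp; }
};

int main() {
  X86GatherNode G;
  Node Plain(GenericGatherOp);
  assert(X86GatherNode::classof(&G));
  assert(!X86GatherNode::classof(&Plain));
  return 0;
}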
+ class X86MaskedGatherSDNode : public MaskedGatherScatterSDNode { + public: + X86MaskedGatherSDNode(unsigned Order, + const DebugLoc &dl, SDVTList VTs, EVT MemVT, + MachineMemOperand *MMO) + : MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT, MMO) + {} + static bool classof(const SDNode *N) { + return N->getOpcode() == X86ISD::MGATHER; + } + }; + } // end namespace llvm #endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 2620679df2517..01a70323224c3 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -7265,13 +7265,13 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { let Predicates = [HasAVX512] in { def : Pat<(ffloor _.FRC:$src), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x1))), _.FRC)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x9))), _.FRC)>; def : Pat<(fceil _.FRC:$src), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x2))), _.FRC)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xa))), _.FRC)>; def : Pat<(ftrunc _.FRC:$src), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x3))), _.FRC)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xb))), _.FRC)>; def : Pat<(frint _.FRC:$src), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x4))), _.FRC)>; @@ -7281,13 +7281,13 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), - addr:$src, (i32 0x1))), _.FRC)>; + addr:$src, (i32 0x9))), _.FRC)>; def : Pat<(fceil (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), - addr:$src, (i32 0x2))), _.FRC)>; + addr:$src, (i32 0xa))), _.FRC)>; def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), - addr:$src, (i32 0x3))), _.FRC)>; + addr:$src, (i32 0xb))), _.FRC)>; def : Pat<(frint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), addr:$src, (i32 0x4))), _.FRC)>; @@ -7869,7 +7869,7 @@ let Predicates = [HasVLX] in { defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128, vx128xmem, mgatherv4i32>, EVEX_V128; defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128, - vx64xmem, mgatherv2i64>, EVEX_V128; + vx64xmem, X86mgatherv2i64>, EVEX_V128; } } @@ -8471,26 +8471,26 @@ multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _, } let Predicates = [HasAVX512] in { def : Pat<(v16f32 (ffloor VR512:$src)), - (VRNDSCALEPSZrri VR512:$src, (i32 0x1))>; + (VRNDSCALEPSZrri VR512:$src, (i32 0x9))>; def : Pat<(v16f32 (fnearbyint VR512:$src)), (VRNDSCALEPSZrri VR512:$src, (i32 0xC))>; def : Pat<(v16f32 (fceil VR512:$src)), - (VRNDSCALEPSZrri VR512:$src, (i32 0x2))>; + (VRNDSCALEPSZrri VR512:$src, (i32 0xA))>; def : Pat<(v16f32 (frint VR512:$src)), (VRNDSCALEPSZrri VR512:$src, (i32 0x4))>; def : Pat<(v16f32 (ftrunc VR512:$src)), - (VRNDSCALEPSZrri VR512:$src, (i32 0x3))>; + (VRNDSCALEPSZrri VR512:$src, (i32 0xB))>; def : Pat<(v8f64 (ffloor VR512:$src)), - 
(VRNDSCALEPDZrri VR512:$src, (i32 0x1))>; + (VRNDSCALEPDZrri VR512:$src, (i32 0x9))>; def : Pat<(v8f64 (fnearbyint VR512:$src)), (VRNDSCALEPDZrri VR512:$src, (i32 0xC))>; def : Pat<(v8f64 (fceil VR512:$src)), - (VRNDSCALEPDZrri VR512:$src, (i32 0x2))>; + (VRNDSCALEPDZrri VR512:$src, (i32 0xA))>; def : Pat<(v8f64 (frint VR512:$src)), (VRNDSCALEPDZrri VR512:$src, (i32 0x4))>; def : Pat<(v8f64 (ftrunc VR512:$src)), - (VRNDSCALEPDZrri VR512:$src, (i32 0x3))>; + (VRNDSCALEPDZrri VR512:$src, (i32 0xB))>; } defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>, diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index c28b35b22977a..8b5bbf24f6f63 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -641,7 +641,7 @@ def sdmem : Operand<v2f64> { // SSE pattern fragments //===----------------------------------------------------------------------===// -// Vector load wrappers to prevent folding of non-temporal aligned loads on +// Vector load wrappers to prevent folding of non-temporal aligned loads on // supporting targets. def vec128load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return !Subtarget->hasSSE41() || !cast<LoadSDNode>(N)->isNonTemporal() || @@ -754,16 +754,6 @@ def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>; def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>; def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; -// These are needed to match a scalar memop that is used in a vector-only -// math instruction such as the FP logical ops: andps, andnps, orps, xorps. -// The memory operand is required to be a 128-bit load, so it must be converted -// from a vector to a scalar. -def memopfsf32_128 : PatFrag<(ops node:$ptr), - (f32 (extractelt (memopv4f32 node:$ptr), (iPTR 0)))>; -def memopfsf64_128 : PatFrag<(ops node:$ptr), - (f64 (extractelt (memopv2f64 node:$ptr), (iPTR 0)))>; - - // SSSE3 uses MMX registers for some instructions. They aren't aligned on a // 16-byte boundary. 
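Returning to the VRNDSCALE pattern changes above (0x1 to 0x9, 0x2 to 0xA, 0x3 to 0xB): per the immediate encoding, the low two bits select the rounding mode and bit 3 appears to suppress the precision (inexact) exception, so the new immediates keep the same modes while silencing inexact. A tiny sketch of that reading; the helper name is made up:

#include <cassert>

// Illustrative encoding of the rounding immediate as used above.
// Bits [1:0]: rounding control (00 nearest, 01 down/floor, 10 up/ceil,
// 11 truncate). Bit 3: suppress the precision (inexact) exception.
enum RoundMode { Floor = 0x1, Ceil = 0x2, Trunc = 0x3 };

static unsigned rndscaleImm(RoundMode M, bool SuppressInexact) {
  return unsigned(M) | (SuppressInexact ? 0x8u : 0x0u);
}

int main() {
  // The patterns above switch from 0x1/0x2/0x3 to 0x9/0xA/0xB, i.e. the same
  // rounding modes with the inexact exception suppressed.
  assert(rndscaleImm(Floor, true) == 0x9);
  assert(rndscaleImm(Ceil, true) == 0xA);
  assert(rndscaleImm(Trunc, true) == 0xB);
  return 0;
}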
// FIXME: 8 byte alignment for mmx reads is not required @@ -773,6 +763,9 @@ def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>; +def X86masked_gather : SDNode<"X86ISD::MGATHER", SDTMaskedGather, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_gather node:$src1, node:$src2, node:$src3) , [{ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) @@ -796,6 +789,15 @@ def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), Mgt->getBasePtr().getValueType() == MVT::v2i64); return false; }]>; +def X86mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (X86MaskedGatherSDNode *Mgt = dyn_cast<X86MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v2i64 || + Mgt->getBasePtr().getValueType() == MVT::v2i64) && + (Mgt->getMemoryVT() == MVT::v2i32 || + Mgt->getMemoryVT() == MVT::v2f32); + return false; +}]>; def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_gather node:$src1, node:$src2, node:$src3) , [{ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 8490b972eb5c1..fe87bbd994738 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -1744,7 +1744,7 @@ def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG; } -def : Pat<(f32 (fpround FR64:$src)), +def : Pat<(f32 (fpround FR64:$src)), (VCVTSD2SSrr (COPY_TO_REGCLASS FR64:$src, FR32), FR64:$src)>, Requires<[UseAVX]>; diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index 77dead8d24137..f98c2a7e802dd 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -72,9 +72,24 @@ private: MachineFunction &MF) const; bool selectCmp(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; - bool selectUadde(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; + bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectInsert(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; + bool selectExtract(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; + + // emit insert subreg instruction and insert it before MachineInstr &I + bool emitInsertSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I, + MachineRegisterInfo &MRI, MachineFunction &MF) const; + // emit extract subreg instruction and insert it before MachineInstr &I + bool emitExtractSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I, + MachineRegisterInfo &MRI, MachineFunction &MF) const; + + const TargetRegisterClass *getRegClass(LLT Ty, const RegisterBank &RB) const; + const TargetRegisterClass *getRegClass(LLT Ty, unsigned Reg, + MachineRegisterInfo &MRI) const; const X86TargetMachine &TM; const X86Subtarget &STI; @@ -113,8 +128,8 @@ X86InstructionSelector::X86InstructionSelector(const X86TargetMachine &TM, // FIXME: This should be target-independent, inferred from the types declared // for each class in the bank. 
-static const TargetRegisterClass * -getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) { +const TargetRegisterClass * +X86InstructionSelector::getRegClass(LLT Ty, const RegisterBank &RB) const { if (RB.getID() == X86::GPRRegBankID) { if (Ty.getSizeInBits() <= 8) return &X86::GR8RegClass; @@ -127,13 +142,13 @@ getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) { } if (RB.getID() == X86::VECRRegBankID) { if (Ty.getSizeInBits() == 32) - return &X86::FR32XRegClass; + return STI.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; if (Ty.getSizeInBits() == 64) - return &X86::FR64XRegClass; + return STI.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; if (Ty.getSizeInBits() == 128) - return &X86::VR128XRegClass; + return STI.hasAVX512() ? &X86::VR128XRegClass : &X86::VR128RegClass; if (Ty.getSizeInBits() == 256) - return &X86::VR256XRegClass; + return STI.hasAVX512() ? &X86::VR256XRegClass : &X86::VR256RegClass; if (Ty.getSizeInBits() == 512) return &X86::VR512RegClass; } @@ -141,10 +156,16 @@ getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) { llvm_unreachable("Unknown RegBank!"); } +const TargetRegisterClass * +X86InstructionSelector::getRegClass(LLT Ty, unsigned Reg, + MachineRegisterInfo &MRI) const { + const RegisterBank &RegBank = *RBI.getRegBank(Reg, MRI, TRI); + return getRegClass(Ty, RegBank); +} + // Set X86 Opcode and constrain DestReg. -static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, - MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, - const RegisterBankInfo &RBI) { +bool X86InstructionSelector::selectCopy(MachineInstr &I, + MachineRegisterInfo &MRI) const { unsigned DstReg = I.getOperand(0).getReg(); if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { @@ -171,7 +192,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, switch (RegBank.getID()) { case X86::GPRRegBankID: assert((DstSize <= 64) && "GPRs cannot get more than 64-bit width values."); - RC = getRegClassForTypeOnBank(MRI.getType(DstReg), RegBank); + RC = getRegClass(MRI.getType(DstReg), RegBank); // Change the physical register if (SrcSize > DstSize && TargetRegisterInfo::isPhysicalRegister(SrcReg)) { @@ -186,7 +207,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, } break; case X86::VECRRegBankID: - RC = getRegClassForTypeOnBank(MRI.getType(DstReg), RegBank); + RC = getRegClass(MRI.getType(DstReg), RegBank); break; default: llvm_unreachable("Unknown RegBank!"); @@ -220,7 +241,7 @@ bool X86InstructionSelector::select(MachineInstr &I) const { // Certain non-generic instructions also need some special handling. if (I.isCopy()) - return selectCopy(I, TII, MRI, TRI, RBI); + return selectCopy(I, MRI); // TODO: handle more cases - LOAD_STACK_GUARD, PHI return true; @@ -249,6 +270,10 @@ bool X86InstructionSelector::select(MachineInstr &I) const { return true; if (selectUadde(I, MRI, MF)) return true; + if (selectExtract(I, MRI, MF)) + return true; + if (selectInsert(I, MRI, MF)) + return true; return false; } @@ -326,6 +351,34 @@ unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB, return Opc; } +// Fill in an address from the given instruction. 
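Before the function itself, a rough model of the folding X86SelectAddress below performs: a pointer defined by a constant-offset G_GEP becomes base plus a 32-bit displacement, and everything else falls back to a bare base register. All types here are illustrative stand-ins:

#include <cassert>
#include <cstdint>

struct AddrMode {
  int BaseReg = 0;
  int32_t Disp = 0;
};

static bool fitsInInt32(int64_t V) {
  return V >= INT32_MIN && V <= INT32_MAX; // same check as isInt<32>(Imm)
}

static AddrMode selectAddress(int PtrReg, const int64_t *ConstOffset) {
  AddrMode AM;
  if (ConstOffset && fitsInInt32(*ConstOffset)) {
    AM.BaseReg = PtrReg;                   // fold base + disp into one operand
    AM.Disp = static_cast<int32_t>(*ConstOffset);
    return AM;
  }
  AM.BaseReg = PtrReg;                     // default: base register only
  return AM;
}

int main() {
  int64_t Off = 16;
  AddrMode AM = selectAddress(/*PtrReg=*/5, &Off);
  assert(AM.BaseReg == 5 && AM.Disp == 16);
  return 0;
}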
+void X86SelectAddress(const MachineInstr &I, const MachineRegisterInfo &MRI,
+ X86AddressMode &AM) {
+
+ assert(I.getOperand(0).isReg() && "unsupported operand.");
+ assert(MRI.getType(I.getOperand(0).getReg()).isPointer() &&
+ "unsupported type.");
+
+ if (I.getOpcode() == TargetOpcode::G_GEP) {
+ if (auto COff = getConstantVRegVal(I.getOperand(2).getReg(), MRI)) {
+ int64_t Imm = *COff;
+ if (isInt<32>(Imm)) { // Check for displacement overflow.
+ AM.Disp = static_cast<int32_t>(Imm);
+ AM.Base.Reg = I.getOperand(1).getReg();
+ return;
+ }
+ }
+ } else if (I.getOpcode() == TargetOpcode::G_FRAME_INDEX) {
+ AM.Base.FrameIndex = I.getOperand(1).getIndex();
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ return;
+ }
+
+ // Default behavior.
+ AM.Base.Reg = I.getOperand(0).getReg();
+ return;
+}
+
 bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
 MachineRegisterInfo &MRI,
 MachineFunction &MF) const {
@@ -340,18 +393,28 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
 auto &MemOp = **I.memoperands_begin();
+ if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
+ DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+ return false;
+ }
+
 unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlignment());
 if (NewOpc == Opc)
 return false;
+ X86AddressMode AM;
+ X86SelectAddress(*MRI.getVRegDef(I.getOperand(1).getReg()), MRI, AM);
+
 I.setDesc(TII.get(NewOpc));
 MachineInstrBuilder MIB(MF, I);
- if (Opc == TargetOpcode::G_LOAD)
- addOffset(MIB, 0);
- else {
+ if (Opc == TargetOpcode::G_LOAD) {
+ I.RemoveOperand(1);
+ addFullAddress(MIB, AM);
+ } else {
 // G_STORE (VAL, Addr), X86Store instruction (Addr, VAL)
+ I.RemoveOperand(1);
 I.RemoveOperand(0);
- addOffset(MIB, 0).addUse(DefReg);
+ addFullAddress(MIB, AM).addUse(DefReg);
 }
 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
@@ -461,11 +524,11 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I,
 if (DstRB.getID() != X86::GPRRegBankID)
 return false;
- const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB);
 if (!DstRC)
 return false;
- const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB);
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcRB);
 if (!SrcRC)
 return false;
@@ -519,9 +582,8 @@ bool X86InstructionSelector::selectZext(MachineInstr &I,
 else
 return false;
- const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
 unsigned DefReg =
- MRI.createVirtualRegister(getRegClassForTypeOnBank(DstTy, RegBank));
+ MRI.createVirtualRegister(getRegClass(DstTy, DstReg, MRI));
 BuildMI(*I.getParent(), I, I.getDebugLoc(),
 TII.get(TargetOpcode::SUBREG_TO_REG), DefReg)
@@ -656,6 +718,202 @@ bool X86InstructionSelector::selectUadde(MachineInstr &I,
 return true;
}
+bool X86InstructionSelector::selectExtract(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ if (I.getOpcode() != TargetOpcode::G_EXTRACT)
+ return false;
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+ int64_t Index = I.getOperand(2).getImm();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+
+ // Meanwhile, handle vector types only.
+ if (!DstTy.isVector())
+ return false;
+
+ if (Index % DstTy.getSizeInBits() != 0)
+ return false; // Not extract subvector.
+
+ if (Index == 0) {
+ // Replace by extract subreg copy.
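The index arithmetic used further down in selectExtract is simple enough to state separately: G_EXTRACT carries a bit index, while the VEXTRACT* instructions take a subvector index. A minimal sketch of that conversion:

#include <cassert>

// Sketch of the immediate conversion in selectExtract: the bit index is
// divided by the destination vector's width in bits.
static int extractImm(int BitIndex, int DstBits) {
  assert(BitIndex % DstBits == 0 && "not a whole-subvector extract");
  return BitIndex / DstBits; // e.g. bit 128 of a 256-bit source -> imm 1
}

int main() {
  // Extracting the upper 128-bit half of a 256-bit vector: immediate 1.
  assert(extractImm(128, 128) == 1);
  // Bit index 0 is handled separately as a plain subregister copy.
  assert(extractImm(0, 128) == 0);
  return 0;
}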
+ if (!emitExtractSubreg(DstReg, SrcReg, I, MRI, MF))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ bool HasAVX = STI.hasAVX();
+ bool HasAVX512 = STI.hasAVX512();
+ bool HasVLX = STI.hasVLX();
+
+ if (SrcTy.getSizeInBits() == 256 && DstTy.getSizeInBits() == 128) {
+ if (HasVLX)
+ I.setDesc(TII.get(X86::VEXTRACTF32x4Z256rr));
+ else if (HasAVX)
+ I.setDesc(TII.get(X86::VEXTRACTF128rr));
+ else
+ return false;
+ } else if (SrcTy.getSizeInBits() == 512 && HasAVX512) {
+ if (DstTy.getSizeInBits() == 128)
+ I.setDesc(TII.get(X86::VEXTRACTF32x4Zrr));
+ else if (DstTy.getSizeInBits() == 256)
+ I.setDesc(TII.get(X86::VEXTRACTF64x4Zrr));
+ else
+ return false;
+ } else
+ return false;
+
+ // Convert to X86 VEXTRACT immediate.
+ Index = Index / DstTy.getSizeInBits();
+ I.getOperand(2).setImm(Index);
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+bool X86InstructionSelector::emitExtractSubreg(unsigned DstReg, unsigned SrcReg,
+ MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+ unsigned SubIdx = X86::NoSubRegister;
+
+ if (!DstTy.isVector() || !SrcTy.isVector())
+ return false;
+
+ assert(SrcTy.getSizeInBits() > DstTy.getSizeInBits() &&
+ "Incorrect Src/Dst register size");
+
+ if (DstTy.getSizeInBits() == 128)
+ SubIdx = X86::sub_xmm;
+ else if (DstTy.getSizeInBits() == 256)
+ SubIdx = X86::sub_ymm;
+ else
+ return false;
+
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstReg, MRI);
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcReg, MRI);
+
+ SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubIdx);
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ DEBUG(dbgs() << "Failed to constrain EXTRACT_SUBREG\n");
+ return false;
+ }
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), DstReg)
+ .addReg(SrcReg, 0, SubIdx);
+
+ return true;
+}
+
+bool X86InstructionSelector::emitInsertSubreg(unsigned DstReg, unsigned SrcReg,
+ MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+ unsigned SubIdx = X86::NoSubRegister;
+
+ // TODO: support scalar types
+ if (!DstTy.isVector() || !SrcTy.isVector())
+ return false;
+
+ assert(SrcTy.getSizeInBits() < DstTy.getSizeInBits() &&
+ "Incorrect Src/Dst register size");
+
+ if (SrcTy.getSizeInBits() == 128)
+ SubIdx = X86::sub_xmm;
+ else if (SrcTy.getSizeInBits() == 256)
+ SubIdx = X86::sub_ymm;
+ else
+ return false;
+
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcReg, MRI);
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstReg, MRI);
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ DEBUG(dbgs() << "Failed to constrain INSERT_SUBREG\n");
+ return false;
+ }
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY))
+ .addReg(DstReg, RegState::DefineNoRead, SubIdx)
+ .addReg(SrcReg);
+
+ return true;
+}
+
+bool X86InstructionSelector::selectInsert(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ if (I.getOpcode() != TargetOpcode::G_INSERT)
+ return false;
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+ const unsigned InsertReg = I.getOperand(2).getReg();
+ int64_t Index = I.getOperand(3).getImm();
+
+ const LLT DstTy =
MRI.getType(DstReg);
+ const LLT InsertRegTy = MRI.getType(InsertReg);
+
+ // Meanwhile, handle vector types only.
+ if (!DstTy.isVector())
+ return false;
+
+ if (Index % InsertRegTy.getSizeInBits() != 0)
+ return false; // Not insert subvector.
+
+ if (Index == 0 && MRI.getVRegDef(SrcReg)->isImplicitDef()) {
+ // Replace by subreg copy.
+ if (!emitInsertSubreg(DstReg, InsertReg, I, MRI, MF))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ bool HasAVX = STI.hasAVX();
+ bool HasAVX512 = STI.hasAVX512();
+ bool HasVLX = STI.hasVLX();
+
+ if (DstTy.getSizeInBits() == 256 && InsertRegTy.getSizeInBits() == 128) {
+ if (HasVLX)
+ I.setDesc(TII.get(X86::VINSERTF32x4Z256rr));
+ else if (HasAVX)
+ I.setDesc(TII.get(X86::VINSERTF128rr));
+ else
+ return false;
+ } else if (DstTy.getSizeInBits() == 512 && HasAVX512) {
+ if (InsertRegTy.getSizeInBits() == 128)
+ I.setDesc(TII.get(X86::VINSERTF32x4Zrr));
+ else if (InsertRegTy.getSizeInBits() == 256)
+ I.setDesc(TII.get(X86::VINSERTF64x4Zrr));
+ else
+ return false;
+ } else
+ return false;
+
+ // Convert to X86 VINSERT immediate.
+ Index = Index / InsertRegTy.getSizeInBits();
+
+ I.getOperand(3).setImm(Index);
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
 InstructionSelector *
 llvm::createX86InstructionSelector(const X86TargetMachine &TM,
 X86Subtarget &Subtarget,
diff --git a/lib/Target/X86/X86InterleavedAccess.cpp b/lib/Target/X86/X86InterleavedAccess.cpp
index 806d6cc888f0f..f0ed4bc16e2f9 100644
--- a/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/lib/Target/X86/X86InterleavedAccess.cpp
@@ -16,6 +16,7 @@
 #include "X86ISelLowering.h"
 #include "X86TargetMachine.h"
+#include "llvm/Analysis/VectorUtils.h"
 using namespace llvm;
@@ -50,9 +51,8 @@ class X86InterleavedAccessGroup {
 IRBuilder<> &Builder;
 /// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
- /// sub vectors of type \p T. Returns true and the sub-vectors in
- /// \p DecomposedVectors if it decomposes the Inst, returns false otherwise.
- bool decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
+ /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
+ void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
 SmallVectorImpl<Instruction *> &DecomposedVectors);
 /// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
@@ -80,8 +80,7 @@ public:
 /// target information \p STarget.
 explicit X86InterleavedAccessGroup(Instruction *I,
 ArrayRef<ShuffleVectorInst *> Shuffs,
- ArrayRef<unsigned> Ind,
- const unsigned F,
+ ArrayRef<unsigned> Ind, const unsigned F,
 const X86Subtarget &STarget,
 IRBuilder<> &B)
 : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
@@ -102,48 +101,61 @@ bool X86InterleavedAccessGroup::isSupported() const {
 uint64_t ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy);
 Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
- if (DL.getTypeSizeInBits(Inst->getType()) < Factor * ShuffleVecSize)
- return false;
+ // Currently, lowering is supported for 4-element vectors of 64 bits on AVX.
+ uint64_t ExpectedShuffleVecSize;
+ if (isa<LoadInst>(Inst))
+ ExpectedShuffleVecSize = 256;
+ else
+ ExpectedShuffleVecSize = 1024;
- // Currently, lowering is supported for 64 bits on AVX.
- if (!Subtarget.hasAVX() || ShuffleVecSize != 256 || + if (!Subtarget.hasAVX() || ShuffleVecSize != ExpectedShuffleVecSize || DL.getTypeSizeInBits(ShuffleEltTy) != 64 || Factor != 4) return false; return true; } -bool X86InterleavedAccessGroup::decompose( +void X86InterleavedAccessGroup::decompose( Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy, SmallVectorImpl<Instruction *> &DecomposedVectors) { + + assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) && + "Expected Load or Shuffle"); + Type *VecTy = VecInst->getType(); (void)VecTy; assert(VecTy->isVectorTy() && DL.getTypeSizeInBits(VecTy) >= DL.getTypeSizeInBits(SubVecTy) * NumSubVectors && "Invalid Inst-size!!!"); - assert(VecTy->getVectorElementType() == SubVecTy->getVectorElementType() && - "Element type mismatched!!!"); - if (!isa<LoadInst>(VecInst)) - return false; + if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) { + Value *Op0 = SVI->getOperand(0); + Value *Op1 = SVI->getOperand(1); + + // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type. + for (unsigned i = 0; i < NumSubVectors; ++i) + DecomposedVectors.push_back( + cast<ShuffleVectorInst>(Builder.CreateShuffleVector( + Op0, Op1, createSequentialMask(Builder, Indices[i], + SubVecTy->getVectorNumElements(), 0)))); + return; + } + // Decompose the load instruction. LoadInst *LI = cast<LoadInst>(VecInst); Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace()); - Value *VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy); - // Generate N loads of T type + // Generate N loads of T type. for (unsigned i = 0; i < NumSubVectors; i++) { - // TODO: Support inbounds GEP + // TODO: Support inbounds GEP. Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i)); Instruction *NewLoad = Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment()); DecomposedVectors.push_back(NewLoad); } - - return true; } void X86InterleavedAccessGroup::transpose_4x4( @@ -181,21 +193,46 @@ void X86InterleavedAccessGroup::transpose_4x4( // instructions/intrinsics. bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { SmallVector<Instruction *, 4> DecomposedVectors; - VectorType *VecTy = Shuffles[0]->getType(); - // Try to generate target-sized register(/instruction). - if (!decompose(Inst, Factor, VecTy, DecomposedVectors)) - return false; - SmallVector<Value *, 4> TransposedVectors; - // Perform matrix-transposition in order to compute interleaved - // results by generating some sort of (optimized) target-specific - // instructions. + VectorType *ShuffleTy = Shuffles[0]->getType(); + + if (isa<LoadInst>(Inst)) { + // Try to generate target-sized register(/instruction). + decompose(Inst, Factor, ShuffleTy, DecomposedVectors); + + // Perform matrix-transposition in order to compute interleaved + // results by generating some sort of (optimized) target-specific + // instructions. + transpose_4x4(DecomposedVectors, TransposedVectors); + + // Now replace the unoptimized-interleaved-vectors with the + // transposed-interleaved vectors. + for (unsigned i = 0, e = Shuffles.size(); i < e; ++i) + Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]); + + return true; + } + + Type *ShuffleEltTy = ShuffleTy->getVectorElementType(); + unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor; + + // Lower the interleaved stores: + // 1. Decompose the interleaved wide shuffle into individual shuffle + // vectors. 
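A compact model of what the numbered store-path steps here add up to: four de-interleaved 4-element vectors are recombined into one wide interleaved vector before a single wide store. Plain arrays stand in for the shuffle-based transpose_4x4:

#include <cassert>

int main() {
  int V[4][4]; // V[j] holds elements j, j+4, j+8, j+12
  for (int j = 0; j < 4; ++j)
    for (int i = 0; i < 4; ++i)
      V[j][i] = j + 4 * i; // the de-interleaved ("transposed") form

  int Wide[16]; // the interleaved wide vector to store
  for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 4; ++j)
      Wide[4 * i + j] = V[j][i];

  for (int k = 0; k < 16; ++k)
    assert(Wide[k] == k); // elements come back out in memory order
  return 0;
}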
+ decompose(Shuffles[0], Factor, + VectorType::get(ShuffleEltTy, NumSubVecElems), DecomposedVectors); + + // 2. Transpose the interleaved-vectors into vectors of contiguous + // elements. transpose_4x4(DecomposedVectors, TransposedVectors); - // Now replace the unoptimized-interleaved-vectors with the - // transposed-interleaved vectors. - for (unsigned i = 0; i < Shuffles.size(); i++) - Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]); + // 3. Concatenate the contiguous-vectors back into a wide vector. + Value *WideVec = concatenateVectors(Builder, TransposedVectors); + + // 4. Generate a store instruction for wide-vec. + StoreInst *SI = cast<StoreInst>(Inst); + Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), + SI->getAlignment()); return true; } @@ -220,3 +257,29 @@ bool X86TargetLowering::lowerInterleavedLoad( return Grp.isSupported() && Grp.lowerIntoOptimizedSequence(); } + +bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI, + ShuffleVectorInst *SVI, + unsigned Factor) const { + assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && + "Invalid interleave factor"); + + assert(SVI->getType()->getVectorNumElements() % Factor == 0 && + "Invalid interleaved store"); + + // Holds the indices of SVI that correspond to the starting index of each + // interleaved shuffle. + SmallVector<unsigned, 4> Indices; + auto Mask = SVI->getShuffleMask(); + for (unsigned i = 0; i < Factor; i++) + Indices.push_back(Mask[i]); + + ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI); + + // Create an interleaved access group. + IRBuilder<> Builder(SI); + X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget, + Builder); + + return Grp.isSupported() && Grp.lowerIntoOptimizedSequence(); +} diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index bc73bb1ae8c51..6b1add8ff8ed1 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -510,12 +510,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::SHUF128, 0), X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC, X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_d_128, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_d_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_d_512, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, @@ -524,16 +518,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_RND), - X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC, X86ISD::FSETCCM, X86ISD::FSETCCM_RND), X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC, X86ISD::FSETCCM, X86ISD::FSETCCM_RND), - X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, 
CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG, @@ -1171,18 +1159,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FSUBS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FSUBS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_d_128, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_d_256, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_d_512, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_q_128, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_q_256, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_q_512, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM, X86ISD::CVTPH2PS, 0), X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM, diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp index 979aaee110aa4..a584eabcc1b28 100644 --- a/lib/Target/X86/X86LegalizerInfo.cpp +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -214,12 +214,24 @@ void X86LegalizerInfo::setLegalizerInfoAVX() { if (!Subtarget.hasAVX()) return; + const LLT v16s8 = LLT::vector(16, 8); + const LLT v8s16 = LLT::vector(8, 16); + const LLT v4s32 = LLT::vector(4, 32); + const LLT v2s64 = LLT::vector(2, 64); + + const LLT v32s8 = LLT::vector(32, 8); + const LLT v16s16 = LLT::vector(16, 16); const LLT v8s32 = LLT::vector(8, 32); const LLT v4s64 = LLT::vector(4, 64); for (unsigned MemOp : {G_LOAD, G_STORE}) for (auto Ty : {v8s32, v4s64}) setAction({MemOp, Ty}, Legal); + + for (auto Ty : {v32s8, v16s16, v8s32, v4s64}) + setAction({G_INSERT, Ty}, Legal); + for (auto Ty : {v16s8, v8s16, v4s32, v2s64}) + setAction({G_INSERT, 1, Ty}, Legal); } void X86LegalizerInfo::setLegalizerInfoAVX2() { @@ -243,6 +255,18 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() { if (!Subtarget.hasAVX512()) return; + const LLT v16s8 = LLT::vector(16, 8); + const LLT v8s16 = LLT::vector(8, 16); + const LLT v4s32 = LLT::vector(4, 32); + const LLT v2s64 = LLT::vector(2, 64); + + const LLT v32s8 = LLT::vector(32, 8); + const LLT v16s16 = LLT::vector(16, 16); + const LLT v8s32 = LLT::vector(8, 32); + const LLT v4s64 = LLT::vector(4, 64); + + const LLT v64s8 = LLT::vector(64, 8); + const LLT v32s16 = LLT::vector(32, 16); const LLT v16s32 = LLT::vector(16, 32); const LLT v8s64 = LLT::vector(8, 64); @@ -256,13 +280,15 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() { for (auto Ty : {v16s32, v8s64}) setAction({MemOp, Ty}, Legal); + for (auto Ty : {v64s8, v32s16, v16s32, v8s64}) + setAction({G_INSERT, Ty}, Legal); + for (auto Ty : {v32s8, v16s16, v8s32, v4s64, v16s8, v8s16, v4s32, v2s64}) + setAction({G_INSERT, 1, Ty}, Legal); + /************ VLX *******************/ 
if (!Subtarget.hasVLX()) return; - const LLT v4s32 = LLT::vector(4, 32); - const LLT v8s32 = LLT::vector(8, 32); - for (auto Ty : {v4s32, v8s32}) setAction({G_MUL, Ty}, Legal); } diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp index dd21e2b7c4a13..8fdf10617059a 100644 --- a/lib/Target/X86/X86MacroFusion.cpp +++ b/lib/Target/X86/X86MacroFusion.cpp @@ -2,39 +2,31 @@ // // The LLVM Compiler Infrastructure // -// \file This file is distributed under the University of Illinois Open Source +// This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // -// This file contains the X86 implementation of the DAG scheduling mutation to -// pair instructions back to back. +/// \file This file contains the X86 implementation of the DAG scheduling +/// mutation to pair instructions back to back. // //===----------------------------------------------------------------------===// #include "X86MacroFusion.h" #include "X86Subtarget.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetInstrInfo.h" - -#define DEBUG_TYPE "misched" - -STATISTIC(NumFused, "Number of instr pairs fused"); +#include "llvm/CodeGen/MacroFusion.h" using namespace llvm; -static cl::opt<bool> EnableMacroFusion("x86-misched-fusion", cl::Hidden, - cl::desc("Enable scheduling for macro fusion."), cl::init(true)); - -namespace { - -/// \brief Verify that the instruction pair, First and Second, -/// should be scheduled back to back. If either instruction is unspecified, -/// then verify that the other instruction may be part of a pair at all. -static bool shouldScheduleAdjacent(const X86Subtarget &ST, - const MachineInstr *First, - const MachineInstr *Second) { +/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused +/// together. Given SecondMI, when FirstMI is unspecified, then check if +/// SecondMI may be part of a fused pair at all. +static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, + const TargetSubtargetInfo &TSI, + const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { + const X86Subtarget &ST = static_cast<const X86Subtarget&>(TSI); // Check if this processor supports macro-fusion. Since this is a minor // heuristic, we haven't specifically reserved a feature. hasAVX is a decent // proxy for SandyBridge+. @@ -47,13 +39,10 @@ static bool shouldScheduleAdjacent(const X86Subtarget &ST, FuseInc } FuseKind; - assert((First || Second) && "At least one instr must be specified"); - unsigned FirstOpcode = First - ? First->getOpcode() + unsigned FirstOpcode = FirstMI + ? FirstMI->getOpcode() : static_cast<unsigned>(X86::INSTRUCTION_LIST_END); - unsigned SecondOpcode = Second - ? Second->getOpcode() - : static_cast<unsigned>(X86::INSTRUCTION_LIST_END); + unsigned SecondOpcode = SecondMI.getOpcode(); switch (SecondOpcode) { default: @@ -203,69 +192,11 @@ static bool shouldScheduleAdjacent(const X86Subtarget &ST, } } -/// \brief Post-process the DAG to create cluster edges between instructions -/// that may be fused by the processor into a single operation. 
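The shouldScheduleAdjacent predicate above has a deliberately asymmetric contract: the second instruction of a candidate pair is always known, while the first may be absent. A small sketch of that shape with made-up opcodes, not X86's actual fusion table:

#include <cassert>

enum Opcode { Cmp, Test, Add, Jcc, Other };

// Given the second instruction (typically the branch), and optionally the
// first, decide whether the pair may be macro-fused.
static bool shouldFuse(const Opcode *First, Opcode Second) {
  if (Second != Jcc)
    return false;   // only fuse with a conditional branch
  if (!First)
    return true;    // "may Second be part of a fused pair at all?"
  switch (*First) {
  case Cmp:
  case Test:
  case Add:
    return true;    // flag-setting ops fuse with the branch
  default:
    return false;
  }
}

int main() {
  Opcode C = Cmp, M = Other;
  assert(shouldFuse(nullptr, Jcc)); // a branch can anchor a pair
  assert(shouldFuse(&C, Jcc));      // cmp+jcc fuses
  assert(!shouldFuse(&M, Jcc));     // an arbitrary op does not
  return 0;
}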
-class X86MacroFusion : public ScheduleDAGMutation { -public: - X86MacroFusion() {} - - void apply(ScheduleDAGInstrs *DAGInstrs) override; -}; - -void X86MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { - ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); - const X86Subtarget &ST = DAG->MF.getSubtarget<X86Subtarget>(); - - // For now, assume targets can only fuse with the branch. - SUnit &ExitSU = DAG->ExitSU; - MachineInstr *Branch = ExitSU.getInstr(); - if (!Branch || !shouldScheduleAdjacent(ST, nullptr, Branch)) - return; - - for (SDep &PredDep : ExitSU.Preds) { - if (PredDep.isWeak()) - continue; - SUnit &SU = *PredDep.getSUnit(); - MachineInstr &Pred = *SU.getInstr(); - if (!shouldScheduleAdjacent(ST, &Pred, Branch)) - continue; - - // Create a single weak edge from SU to ExitSU. The only effect is to cause - // bottom-up scheduling to heavily prioritize the clustered SU. There is no - // need to copy predecessor edges from ExitSU to SU, since top-down - // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling - // of SU, we could create an artificial edge from the deepest root, but it - // hasn't been needed yet. - bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster)); - (void)Success; - assert(Success && "No DAG nodes should be reachable from ExitSU"); - - // Adjust latency of data deps between the nodes. - for (SDep &PredDep : ExitSU.Preds) - if (PredDep.getSUnit() == &SU) - PredDep.setLatency(0); - for (SDep &SuccDep : SU.Succs) - if (SuccDep.getSUnit() == &ExitSU) - SuccDep.setLatency(0); - - ++NumFused; - DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse "; - SU.print(dbgs(), DAG); - dbgs() << " - ExitSU" - << " / " << DAG->TII->getName(Pred.getOpcode()) << " - " - << DAG->TII->getName(Branch->getOpcode()) << '\n';); - - break; - } -} - -} // end namespace - namespace llvm { std::unique_ptr<ScheduleDAGMutation> createX86MacroFusionDAGMutation () { - return EnableMacroFusion ? make_unique<X86MacroFusion>() : nullptr; + return createBranchMacroFusionDAGMutation(shouldScheduleAdjacent); } } // end namespace llvm diff --git a/lib/Target/X86/X86MacroFusion.h b/lib/Target/X86/X86MacroFusion.h index e630f802e8e63..13fa2d78a0185 100644 --- a/lib/Target/X86/X86MacroFusion.h +++ b/lib/Target/X86/X86MacroFusion.h @@ -2,23 +2,18 @@ // // The LLVM Compiler Infrastructure // -// \file This file is distributed under the University of Illinois Open Source +// This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // -// This file contains the X86 definition of the DAG scheduling mutation to pair -// instructions back to back. +/// \file This file contains the X86 definition of the DAG scheduling mutation +/// to pair instructions back to back. // //===----------------------------------------------------------------------===// -#include "X86InstrInfo.h" #include "llvm/CodeGen/MachineScheduler.h" -//===----------------------------------------------------------------------===// -// X86MacroFusion - DAG post-processing to encourage fusion of macro ops. 
-//===----------------------------------------------------------------------===// - namespace llvm { /// Note that you have to add: diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 278b57eb00b74..a9f42cacf7886 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -91,6 +91,8 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { return llvm::make_unique<X86FreeBSDTargetObjectFile>(); if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU()) return llvm::make_unique<X86LinuxNaClTargetObjectFile>(); + if (TT.isOSSolaris()) + return llvm::make_unique<X86SolarisTargetObjectFile>(); if (TT.isOSFuchsia()) return llvm::make_unique<X86FuchsiaTargetObjectFile>(); if (TT.isOSBinFormatELF()) diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index 4fd95717478e9..8627c06d44313 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -86,6 +86,12 @@ X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx, InitializeELF(TM.Options.UseInitArray); } +void X86SolarisTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} + const MCExpr *X86WindowsTargetObjectFile::lowerRelativeReference( const GlobalValue *LHS, const GlobalValue *RHS, const TargetMachine &TM) const { diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index 39d2e84e5ed77..f6aa570b6332a 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -66,6 +66,11 @@ namespace llvm { void Initialize(MCContext &Ctx, const TargetMachine &TM) override; }; + /// \brief This implementation is used for Solaris on x86/x86-64. + class X86SolarisTargetObjectFile : public X86ELFTargetObjectFile { + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + }; + /// \brief This implementation is used for Windows targets on x86 and x86-64. class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF { const MCExpr * diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 11ba7025e1b73..5ba8534d32d33 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2178,17 +2178,6 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); } -bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1, - TargetTransformInfo::LSRCost &C2) { - // X86 specific here are "instruction number 1st priority". - return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, - C1.NumIVMuls, C1.NumBaseAdds, - C1.ScaleCost, C1.ImmCost, C1.SetupCost) < - std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, - C2.NumIVMuls, C2.NumBaseAdds, - C2.ScaleCost, C2.ImmCost, C2.SetupCost); -} - bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { Type *ScalarTy = DataTy->getScalarType(); int DataWidth = isa<PointerType>(ScalarTy) ? @@ -2243,6 +2232,12 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, return (CallerBits & CalleeBits) == CalleeBits; } +bool X86TTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { + // TODO: We can increase these based on available vector ops. + MaxLoadSize = ST->is64Bit() ? 
8 : 4;
+ return true;
+}
+
 bool X86TTIImpl::enableInterleavedAccessVectorization() {
 // TODO: We expect this to be beneficial regardless of arch,
 // but there are currently some unexplained performance artifacts on Atom.
@@ -2250,6 +2245,114 @@ bool X86TTIImpl::enableInterleavedAccessVectorization() {
 return !(ST->isAtom());
}
+// Get estimation for interleaved load/store operations for AVX2.
+// \p Factor is the interleaved-access factor (stride) - number of
+// (interleaved) elements in the group.
+// \p Indices contains the indices for a strided load: when the
+// interleaved load has gaps they indicate which elements are used.
+// If Indices is empty (or if the number of indices is equal to the size
+// of the interleaved-access as given in \p Factor) the access has no gaps.
+//
+// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
+// computing the cost using a generic formula as a function of generic
+// shuffles. We therefore use a lookup table instead, filled according to
+// the instruction sequences that codegen currently generates.
+int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+
+ // We currently support only fully-interleaved groups, with no gaps.
+ // TODO: Support also strided loads (interleaved-groups with gaps).
+ if (Indices.size() && Indices.size() != Factor)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ // VecTy for interleave memop is <VF*Factor x Elt>.
+ // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
+ // VecTy = <12 x i32>.
+ MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+
+ // This function can be called with VecTy=<6xi128>, Factor=3, in which case
+ // the VF=2, while v2i128 is an unsupported MVT vector type
+ // (see MachineValueType.h::getVectorVT()).
+ if (!LegalVT.isVector())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ unsigned VF = VecTy->getVectorNumElements() / Factor;
+ Type *ScalarTy = VecTy->getVectorElementType();
+
+ // Calculate the number of memory operations (NumOfMemOps) required
+ // to load/store the VecTy.
+ unsigned VecTySize = DL.getTypeStoreSize(VecTy);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+
+ // Get the cost of one memory operation.
+ Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
+ LegalVT.getVectorNumElements());
+ unsigned MemOpCost =
+ getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+
+ VectorType *VT = VectorType::get(ScalarTy, VF);
+ EVT ETy = TLI->getValueType(DL, VT);
+ if (!ETy.isSimple())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ // TODO: Complete for other data-types and strides.
+ // Each combination of Stride, ElementTy and VF results in a different
+ // sequence; The cost tables are therefore accessed with:
+ // Factor (stride) and VectorType=VFxElemType.
+ // The Cost accounts only for the shuffle sequence;
+ // The cost of the loads/stores is accounted for separately.
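A worked instance of the cost formula used here, taking the {3, MVT::v16i8, 18} entry from the load table that follows; the per-memory-op cost is an assumed value, since the real one comes from getMemoryOpCost:

#include <cassert>

int main() {
  unsigned VecTySize = 48;   // <48 x i8>: VF=16, Factor=3
  unsigned LegalVTSize = 16; // a 128-bit legal vector type
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; // = 3
  assert(NumOfMemOps == 3);

  unsigned MemOpCost = 1;    // assumption: one unit per legal-type load
  unsigned ShuffleCost = 18; // table: deinterleave 3 x v16i8 after the loads
  unsigned Total = NumOfMemOps * MemOpCost + ShuffleCost;
  assert(Total == 21);
  return 0;
}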
+ // + static const CostTblEntry AVX2InterleavedLoadTbl[] = { + { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8 + { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8 + { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8 + { 3, MVT::v16i8, 18}, //(load 48i8 and) deinterleave into 3 x 16i8 + { 3, MVT::v32i8, 42 }, //(load 96i8 and) deinterleave into 3 x 32i8 + + { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8 + { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8 + { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8 + { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8 + { 4, MVT::v32i8, 80 } //(load 128i8 and) deinterleave into 4 x 32i8 + }; + + static const CostTblEntry AVX2InterleavedStoreTbl[] = { + { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store) + { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store) + { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store) + { 3, MVT::v16i8, 17 }, //interleave 3 x 16i8 into 48i8 (and store) + { 3, MVT::v32i8, 32 }, //interleave 3 x 32i8 into 96i8 (and store) + + { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store) + { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store) + { 4, MVT::v8i8, 16 }, //interleave 4 x 8i8 into 32i8 (and store) + { 4, MVT::v16i8, 20 }, //interleave 4 x 16i8 into 64i8 (and store) + { 4, MVT::v32i8, 40 } //interleave 4 x 32i8 into 128i8 (and store) + }; + + if (Opcode == Instruction::Load) { + if (const auto *Entry = + CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT())) + return NumOfMemOps * MemOpCost + Entry->Cost; + } else { + assert(Opcode == Instruction::Store && + "Expected Store Instruction at this point"); + if (const auto *Entry = + CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT())) + return NumOfMemOps * MemOpCost + Entry->Cost; + } + + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); +} + // Get estimation for interleaved load/store operations and strided load. // \p Indices contains indices for strided load. // \p Factor - the factor of interleaving. 
@@ -2358,6 +2461,10 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI())) return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices, Alignment, AddressSpace); + if (ST->hasAVX2()) + return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, Alignment, AddressSpace); } diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index 09ce2c90498d9..ad0a0a2113012 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -93,6 +93,9 @@ public: int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, unsigned AddressSpace); + int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, + unsigned Factor, ArrayRef<unsigned> Indices, + unsigned Alignment, unsigned AddressSpace); int getIntImmCost(int64_t); @@ -101,15 +104,13 @@ public: int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty); - bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, - TargetTransformInfo::LSRCost &C2); bool isLegalMaskedLoad(Type *DataType); bool isLegalMaskedStore(Type *DataType); bool isLegalMaskedGather(Type *DataType); bool isLegalMaskedScatter(Type *DataType); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; - + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize); bool enableInterleavedAccessVectorization(); private: int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, |
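Finally, a sketch of the kind of expansion the new expandMemCmp hook enables: with MaxLoadSize = 8 on 64-bit targets, a 16-byte equality memcmp becomes two 8-byte loads per operand and a fused compare. Only the equality form is modeled here; producing memcmp's ordered result would additionally need byte-swapped comparisons:

#include <cassert>
#include <cstdint>
#include <cstring>

static bool equal16(const void *A, const void *B) {
  uint64_t A0, A1, B0, B1;
  std::memcpy(&A0, A, 8);                                 // load chunk 0
  std::memcpy(&B0, B, 8);
  std::memcpy(&A1, static_cast<const char *>(A) + 8, 8);  // load chunk 1
  std::memcpy(&B1, static_cast<const char *>(B) + 8, 8);
  return ((A0 ^ B0) | (A1 ^ B1)) == 0;                    // fuse both compares
}

int main() {
  char X[16] = "0123456789abcde";
  char Y[16] = "0123456789abcde";
  assert(equal16(X, Y) && (std::memcmp(X, Y, 16) == 0));
  Y[12] = '!';
  assert(!equal16(X, Y) && (std::memcmp(X, Y, 16) != 0));
  return 0;
}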