author     Dimitry Andric <dim@FreeBSD.org>    2017-06-26 20:32:52 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2017-06-26 20:32:52 +0000
commit     08bbd35a80bf7765fe0d3043f9eb5a2f2786b649 (patch)
tree       80108f0f128657f8623f8f66ad9735b4d88e7b47 /lib/Target
parent     7c7aba6e5fef47a01a136be655b0a92cfd7090f6 (diff)
Diffstat (limited to 'lib/Target')
-rw-r--r--  lib/Target/AArch64/AArch64.h | 2
-rw-r--r--  lib/Target/AArch64/AArch64CondBrTuning.cpp | 336
-rw-r--r--  lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp | 47
-rw-r--r--  lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 15
-rw-r--r--  lib/Target/AArch64/AArch64ISelLowering.cpp | 11
-rw-r--r--  lib/Target/AArch64/AArch64InstrAtomics.td | 46
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.cpp | 6
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.h | 38
-rw-r--r--  lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp | 20
-rw-r--r--  lib/Target/AArch64/AArch64MacroFusion.cpp | 183
-rw-r--r--  lib/Target/AArch64/AArch64MacroFusion.h | 11
-rw-r--r--  lib/Target/AArch64/AArch64RegisterBankInfo.cpp | 4
-rw-r--r--  lib/Target/AArch64/AArch64SchedA57.td | 2
-rw-r--r--  lib/Target/AArch64/AArch64SchedFalkorDetails.td | 421
-rw-r--r--  lib/Target/AArch64/AArch64SchedKryoDetails.td | 6
-rw-r--r--  lib/Target/AArch64/AArch64TargetMachine.cpp | 7
-rw-r--r--  lib/Target/AArch64/CMakeLists.txt | 1
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 70
-rw-r--r--  lib/Target/AMDGPU/AMDGPU.td | 47
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 23
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.h | 25
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 11
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstrInfo.td | 10
-rw-r--r--  lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 10
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 5
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSubtarget.h | 25
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 8
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 22
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 6
-rw-r--r--  lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 151
-rw-r--r--  lib/Target/AMDGPU/BUFInstructions.td | 411
-rw-r--r--  lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 100
-rw-r--r--  lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 10
-rw-r--r--  lib/Target/AMDGPU/FLATInstructions.td | 70
-rw-r--r--  lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp | 34
-rw-r--r--  lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h | 8
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 48
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h | 12
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 64
-rw-r--r--  lib/Target/AMDGPU/Processors.td | 4
-rw-r--r--  lib/Target/AMDGPU/R600ISelLowering.cpp | 24
-rw-r--r--  lib/Target/AMDGPU/SIDefines.h | 6
-rw-r--r--  lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 30
-rw-r--r--  lib/Target/AMDGPU/SIFoldOperands.cpp | 12
-rw-r--r--  lib/Target/AMDGPU/SIFrameLowering.cpp | 83
-rw-r--r--  lib/Target/AMDGPU/SIFrameLowering.h | 4
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.cpp | 261
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.h | 5
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.cpp | 71
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.h | 3
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.td | 195
-rw-r--r--  lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 11
-rw-r--r--  lib/Target/AMDGPU/SIMachineFunctionInfo.h | 14
-rw-r--r--  lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 100
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.cpp | 40
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.h | 7
-rw-r--r--  lib/Target/AMDGPU/SIShrinkInstructions.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 21
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 3
-rw-r--r--  lib/Target/AMDGPU/VOP1Instructions.td | 16
-rw-r--r--  lib/Target/AMDGPU/VOP2Instructions.td | 68
-rw-r--r--  lib/Target/AMDGPU/VOPCInstructions.td | 28
-rw-r--r--  lib/Target/AMDGPU/VOPInstructions.td | 88
-rw-r--r--  lib/Target/ARM/ARM.td | 3
-rw-r--r--  lib/Target/ARM/ARMAsmPrinter.cpp | 3
-rw-r--r--  lib/Target/ARM/ARMBaseInstrInfo.cpp | 8
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp | 54
-rw-r--r--  lib/Target/ARM/ARMISelLowering.h | 2
-rw-r--r--  lib/Target/ARM/ARMInstrThumb2.td | 10
-rw-r--r--  lib/Target/ARM/ARMInstructionSelector.cpp | 105
-rw-r--r--  lib/Target/ARM/ARMLegalizerInfo.cpp | 6
-rw-r--r--  lib/Target/ARM/ARMMacroFusion.cpp | 57
-rw-r--r--  lib/Target/ARM/ARMMacroFusion.h | 24
-rw-r--r--  lib/Target/ARM/ARMRegisterBankInfo.cpp | 10
-rw-r--r--  lib/Target/ARM/ARMSubtarget.h | 8
-rw-r--r--  lib/Target/ARM/ARMTargetMachine.cpp | 7
-rw-r--r--  lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 66
-rw-r--r--  lib/Target/ARM/CMakeLists.txt | 1
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 63
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h | 14
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp | 7
-rw-r--r--  lib/Target/BPF/BPFISelDAGToDAG.cpp | 30
-rw-r--r--  lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp | 12
-rw-r--r--  lib/Target/Hexagon/HexagonExpandCondsets.cpp | 15
-rw-r--r--  lib/Target/Hexagon/HexagonFrameLowering.cpp | 61
-rw-r--r--  lib/Target/Hexagon/HexagonGenMux.cpp | 86
-rw-r--r--  lib/Target/Hexagon/HexagonInstrInfo.cpp | 10
-rw-r--r--  lib/Target/Hexagon/HexagonNewValueJump.cpp | 2
-rw-r--r--  lib/Target/Hexagon/HexagonPeephole.cpp | 54
-rw-r--r--  lib/Target/Hexagon/HexagonSubtarget.cpp | 32
-rw-r--r--  lib/Target/Hexagon/HexagonSubtarget.h | 22
-rw-r--r--  lib/Target/Hexagon/HexagonTargetMachine.cpp | 1
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp | 18
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp | 53
-rw-r--r--  lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp | 12
-rw-r--r--  lib/Target/MSP430/MSP430TargetMachine.cpp | 8
-rw-r--r--  lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 24
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp | 8
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h | 5
-rw-r--r--  lib/Target/Mips/Mips32r6InstrInfo.td | 6
-rw-r--r--  lib/Target/Mips/MipsISelLowering.cpp | 5
-rw-r--r--  lib/Target/Mips/MipsLongBranch.cpp | 12
-rw-r--r--  lib/Target/Mips/MipsSEISelLowering.cpp | 18
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 11
-rw-r--r--  lib/Target/PowerPC/PPC.h | 1
-rw-r--r--  lib/Target/PowerPC/PPCFrameLowering.cpp | 2
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.cpp | 11
-rw-r--r--  lib/Target/PowerPC/PPCInstr64Bit.td | 8
-rw-r--r--  lib/Target/PowerPC/PPCInstrInfo.cpp | 23
-rw-r--r--  lib/Target/PowerPC/PPCInstrInfo.h | 2
-rw-r--r--  lib/Target/PowerPC/PPCRegisterInfo.cpp | 15
-rw-r--r--  lib/Target/PowerPC/PPCTargetMachine.cpp | 72
-rw-r--r--  lib/Target/PowerPC/PPCTargetMachine.h | 25
-rw-r--r--  lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 12
-rw-r--r--  lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp | 12
-rw-r--r--  lib/Target/Sparc/SparcTargetObjectFile.cpp | 6
-rw-r--r--  lib/Target/Sparc/SparcTargetObjectFile.h | 2
-rw-r--r--  lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp | 15
-rw-r--r--  lib/Target/SystemZ/SystemZFrameLowering.cpp | 17
-rw-r--r--  lib/Target/SystemZ/SystemZISelLowering.cpp | 15
-rw-r--r--  lib/Target/SystemZ/SystemZISelLowering.h | 6
-rw-r--r--  lib/Target/SystemZ/SystemZInstrInfo.td | 11
-rw-r--r--  lib/Target/SystemZ/SystemZMachineScheduler.cpp | 2
-rw-r--r--  lib/Target/SystemZ/SystemZMachineScheduler.h | 2
-rw-r--r--  lib/Target/SystemZ/SystemZOperators.td | 2
-rw-r--r--  lib/Target/SystemZ/SystemZTargetMachine.h | 2
-rw-r--r--  lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp | 30
-rw-r--r--  lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp | 18
-rw-r--r--  lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp | 2
-rw-r--r--  lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h | 3
-rw-r--r--  lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp | 2
-rw-r--r--  lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp | 7
-rw-r--r--  lib/Target/WebAssembly/WebAssemblyFastISel.cpp | 20
-rw-r--r--  lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp | 29
-rw-r--r--  lib/Target/WebAssembly/WebAssemblyRegStackify.cpp | 32
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmParser.cpp | 3
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 8
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp | 3
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp | 17
-rw-r--r--  lib/Target/X86/X86FrameLowering.cpp | 37
-rw-r--r--  lib/Target/X86/X86ISelDAGToDAG.cpp | 28
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 190
-rw-r--r--  lib/Target/X86/X86ISelLowering.h | 26
-rw-r--r--  lib/Target/X86/X86InstrAVX512.td | 26
-rw-r--r--  lib/Target/X86/X86InstrFragmentsSIMD.td | 24
-rw-r--r--  lib/Target/X86/X86InstrSSE.td | 2
-rw-r--r--  lib/Target/X86/X86InstructionSelector.cpp | 300
-rw-r--r--  lib/Target/X86/X86InterleavedAccess.cpp | 125
-rw-r--r--  lib/Target/X86/X86IntrinsicsInfo.h | 24
-rw-r--r--  lib/Target/X86/X86LegalizerInfo.cpp | 32
-rw-r--r--  lib/Target/X86/X86MacroFusion.cpp | 101
-rw-r--r--  lib/Target/X86/X86MacroFusion.h | 11
-rw-r--r--  lib/Target/X86/X86TargetMachine.cpp | 2
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.cpp | 6
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.h | 5
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.cpp | 129
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.h | 7
157 files changed, 4121 insertions, 1766 deletions
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index 3e0e3978b90b5..37b9690d0434a 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -31,6 +31,7 @@ class MachineFunctionPass;
FunctionPass *createAArch64DeadRegisterDefinitions();
FunctionPass *createAArch64RedundantCopyEliminationPass();
+FunctionPass *createAArch64CondBrTuning();
FunctionPass *createAArch64ConditionalCompares();
FunctionPass *createAArch64AdvSIMDScalar();
FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM,
@@ -55,6 +56,7 @@ void initializeAArch64A53Fix835769Pass(PassRegistry&);
void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
void initializeAArch64AdvSIMDScalarPass(PassRegistry&);
void initializeAArch64CollectLOHPass(PassRegistry&);
+void initializeAArch64CondBrTuningPass(PassRegistry &);
void initializeAArch64ConditionalComparesPass(PassRegistry&);
void initializeAArch64ConditionOptimizerPass(PassRegistry&);
void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);
diff --git a/lib/Target/AArch64/AArch64CondBrTuning.cpp b/lib/Target/AArch64/AArch64CondBrTuning.cpp
new file mode 100644
index 0000000000000..f27bc97ec3f3e
--- /dev/null
+++ b/lib/Target/AArch64/AArch64CondBrTuning.cpp
@@ -0,0 +1,336 @@
+//===-- AArch64CondBrTuning.cpp --- Conditional branch tuning for AArch64 -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file contains a pass that transforms CBZ/CBNZ/TBZ/TBNZ instructions
+/// into a conditional branch (B.cond), when the NZCV flags can be set for
+/// "free". This is preferred on targets that have more flexibility when
+/// scheduling B.cond instructions as compared to CBZ/CBNZ/TBZ/TBNZ (assuming
+/// all other variables are equal). This can also reduce register pressure.
+///
+/// A few examples:
+///
+/// 1) add w8, w0, w1 -> cmn w0, w1 ; CMN is an alias of ADDS.
+///    cbz w8, .LBB0_2     ->  b.eq .LBB0_2
+///
+/// 2) add w8, w0, w1 -> adds w8, w0, w1 ; w8 has multiple uses.
+/// cbz w8, .LBB1_2 -> b.eq .LBB1_2
+///
+/// 3) sub w8, w0, w1 -> subs w8, w0, w1 ; w8 has multiple uses.
+/// tbz w8, #31, .LBB6_2 -> b.ge .LBB6_2
+///
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-cond-br-tuning"
+#define AARCH64_CONDBR_TUNING_NAME "AArch64 Conditional Branch Tuning"
+
+namespace {
+class AArch64CondBrTuning : public MachineFunctionPass {
+ const AArch64InstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+
+ MachineRegisterInfo *MRI;
+
+public:
+ static char ID;
+ AArch64CondBrTuning() : MachineFunctionPass(ID) {
+ initializeAArch64CondBrTuningPass(*PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ StringRef getPassName() const override { return AARCH64_CONDBR_TUNING_NAME; }
+
+private:
+ MachineInstr *getOperandDef(const MachineOperand &MO);
+ MachineInstr *convertToFlagSetting(MachineInstr &MI, bool IsFlagSetting);
+ MachineInstr *convertToCondBr(MachineInstr &MI);
+ bool tryToTuneBranch(MachineInstr &MI, MachineInstr &DefMI);
+};
+} // end anonymous namespace
+
+char AArch64CondBrTuning::ID = 0;
+
+INITIALIZE_PASS(AArch64CondBrTuning, "aarch64-cond-br-tuning",
+ AARCH64_CONDBR_TUNING_NAME, false, false)
+
+void AArch64CondBrTuning::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachineInstr *AArch64CondBrTuning::getOperandDef(const MachineOperand &MO) {
+ if (!TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ return nullptr;
+ return MRI->getUniqueVRegDef(MO.getReg());
+}
+
+MachineInstr *AArch64CondBrTuning::convertToFlagSetting(MachineInstr &MI,
+ bool IsFlagSetting) {
+  // If this is already the flag-setting version of the instruction (e.g., SUBS),
+  // just make sure the implicit-def of NZCV isn't marked dead.
+ if (IsFlagSetting) {
+ for (unsigned I = MI.getNumExplicitOperands(), E = MI.getNumOperands();
+ I != E; ++I) {
+ MachineOperand &MO = MI.getOperand(I);
+ if (MO.isReg() && MO.isDead() && MO.getReg() == AArch64::NZCV)
+ MO.setIsDead(false);
+ }
+ return &MI;
+ }
+ bool Is64Bit;
+ unsigned NewOpc = TII->convertToFlagSettingOpc(MI.getOpcode(), Is64Bit);
+ unsigned NewDestReg = MI.getOperand(0).getReg();
+ if (MRI->hasOneNonDBGUse(MI.getOperand(0).getReg()))
+ NewDestReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+
+ MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(NewOpc), NewDestReg);
+ for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
+ MIB.add(MI.getOperand(I));
+
+ return MIB;
+}
+
+MachineInstr *AArch64CondBrTuning::convertToCondBr(MachineInstr &MI) {
+ AArch64CC::CondCode CC;
+ MachineBasicBlock *TargetMBB = TII->getBranchDestBlock(MI);
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+
+ case AArch64::CBZW:
+ case AArch64::CBZX:
+ CC = AArch64CC::EQ;
+ break;
+ case AArch64::CBNZW:
+ case AArch64::CBNZX:
+ CC = AArch64CC::NE;
+ break;
+ case AArch64::TBZW:
+ case AArch64::TBZX:
+ CC = AArch64CC::GE;
+ break;
+ case AArch64::TBNZW:
+ case AArch64::TBNZX:
+ CC = AArch64CC::LT;
+ break;
+ }
+ return BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::Bcc))
+ .addImm(CC)
+ .addMBB(TargetMBB);
+}
+
+bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI,
+ MachineInstr &DefMI) {
+ // We don't want NZCV bits live across blocks.
+ if (MI.getParent() != DefMI.getParent())
+ return false;
+
+ bool IsFlagSetting = true;
+ unsigned MIOpc = MI.getOpcode();
+ MachineInstr *NewCmp = nullptr, *NewBr = nullptr;
+ switch (DefMI.getOpcode()) {
+ default:
+ return false;
+ case AArch64::ADDWri:
+ case AArch64::ADDWrr:
+ case AArch64::ADDWrs:
+ case AArch64::ADDWrx:
+ case AArch64::ANDWri:
+ case AArch64::ANDWrr:
+ case AArch64::ANDWrs:
+ case AArch64::BICWrr:
+ case AArch64::BICWrs:
+ case AArch64::SUBWri:
+ case AArch64::SUBWrr:
+ case AArch64::SUBWrs:
+ case AArch64::SUBWrx:
+ IsFlagSetting = false;
+ case AArch64::ADDSWri:
+ case AArch64::ADDSWrr:
+ case AArch64::ADDSWrs:
+ case AArch64::ADDSWrx:
+ case AArch64::ANDSWri:
+ case AArch64::ANDSWrr:
+ case AArch64::ANDSWrs:
+ case AArch64::BICSWrr:
+ case AArch64::BICSWrs:
+ case AArch64::SUBSWri:
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSWrs:
+ case AArch64::SUBSWrx:
+ switch (MIOpc) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+
+ case AArch64::CBZW:
+ case AArch64::CBNZW:
+ case AArch64::TBZW:
+ case AArch64::TBNZW:
+ // Check to see if the TBZ/TBNZ is checking the sign bit.
+ if ((MIOpc == AArch64::TBZW || MIOpc == AArch64::TBNZW) &&
+ MI.getOperand(1).getImm() != 31)
+ return false;
+
+ // There must not be any instruction between DefMI and MI that clobbers or
+ // reads NZCV.
+ MachineBasicBlock::iterator I(DefMI), E(MI);
+ for (I = std::next(I); I != E; ++I) {
+ if (I->modifiesRegister(AArch64::NZCV, TRI) ||
+ I->readsRegister(AArch64::NZCV, TRI))
+ return false;
+ }
+ DEBUG(dbgs() << " Replacing instructions:\n ");
+ DEBUG(DefMI.print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(MI.print(dbgs()));
+
+ NewCmp = convertToFlagSetting(DefMI, IsFlagSetting);
+ NewBr = convertToCondBr(MI);
+ break;
+ }
+ break;
+
+ case AArch64::ADDXri:
+ case AArch64::ADDXrr:
+ case AArch64::ADDXrs:
+ case AArch64::ADDXrx:
+ case AArch64::ANDXri:
+ case AArch64::ANDXrr:
+ case AArch64::ANDXrs:
+ case AArch64::BICXrr:
+ case AArch64::BICXrs:
+ case AArch64::SUBXri:
+ case AArch64::SUBXrr:
+ case AArch64::SUBXrs:
+ case AArch64::SUBXrx:
+ IsFlagSetting = false;
+ case AArch64::ADDSXri:
+ case AArch64::ADDSXrr:
+ case AArch64::ADDSXrs:
+ case AArch64::ADDSXrx:
+ case AArch64::ANDSXri:
+ case AArch64::ANDSXrr:
+ case AArch64::ANDSXrs:
+ case AArch64::BICSXrr:
+ case AArch64::BICSXrs:
+ case AArch64::SUBSXri:
+ case AArch64::SUBSXrr:
+ case AArch64::SUBSXrs:
+ case AArch64::SUBSXrx:
+ switch (MIOpc) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+
+ case AArch64::CBZX:
+ case AArch64::CBNZX:
+ case AArch64::TBZX:
+ case AArch64::TBNZX: {
+ // Check to see if the TBZ/TBNZ is checking the sign bit.
+ if ((MIOpc == AArch64::TBZX || MIOpc == AArch64::TBNZX) &&
+ MI.getOperand(1).getImm() != 63)
+ return false;
+ // There must not be any instruction between DefMI and MI that clobbers or
+ // reads NZCV.
+ MachineBasicBlock::iterator I(DefMI), E(MI);
+ for (I = std::next(I); I != E; ++I) {
+ if (I->modifiesRegister(AArch64::NZCV, TRI) ||
+ I->readsRegister(AArch64::NZCV, TRI))
+ return false;
+ }
+ DEBUG(dbgs() << " Replacing instructions:\n ");
+ DEBUG(DefMI.print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(MI.print(dbgs()));
+
+ NewCmp = convertToFlagSetting(DefMI, IsFlagSetting);
+ NewBr = convertToCondBr(MI);
+ break;
+ }
+ }
+ break;
+ }
+ assert(NewCmp && NewBr && "Expected new instructions.");
+
+ DEBUG(dbgs() << " with instruction:\n ");
+ DEBUG(NewCmp->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(NewBr->print(dbgs()));
+
+  // If this was already the flag-setting version of the instruction, we reused
+  // the original instruction by just clearing the dead flag on the
+  // implicit-def of NZCV. Therefore, we should not erase this instruction.
+ if (!IsFlagSetting)
+ DefMI.eraseFromParent();
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AArch64CondBrTuning::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ DEBUG(dbgs() << "********** AArch64 Conditional Branch Tuning **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+
+ TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ bool LocalChange = false;
+ for (MachineBasicBlock::iterator I = MBB.getFirstTerminator(),
+ E = MBB.end();
+ I != E; ++I) {
+ MachineInstr &MI = *I;
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case AArch64::CBZW:
+ case AArch64::CBZX:
+ case AArch64::CBNZW:
+ case AArch64::CBNZX:
+ case AArch64::TBZW:
+ case AArch64::TBZX:
+ case AArch64::TBNZW:
+ case AArch64::TBNZX:
+ MachineInstr *DefMI = getOperandDef(MI.getOperand(0));
+ LocalChange = (DefMI && tryToTuneBranch(MI, *DefMI));
+ break;
+ }
+ // If the optimization was successful, we can't optimize any other
+ // branches because doing so would clobber the NZCV flags.
+ if (LocalChange) {
+ Changed = true;
+ break;
+ }
+ }
+ }
+ return Changed;
+}
+
+FunctionPass *llvm::createAArch64CondBrTuning() {
+ return new AArch64CondBrTuning();
+}
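
As a rough source-level illustration of case 1 from the pass header above (a hedged sketch: the function names are hypothetical, and the exact codegen depends on optimization level and subtarget tuning):

  extern int g();

  int f(int a, int b) {
    // Without the pass:  add w8, w0, w1 ; cbz w8, .LBB0_2
    // With the pass:     cmn w0, w1     ; b.eq .LBB0_2   (CMN is an ADDS alias)
    if (a + b == 0)
      return g();
    return 1;
  }
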
diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index 544f67433fd53..ee54550c9900b 100644
--- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -13,7 +13,9 @@
#include "AArch64.h"
#include "AArch64RegisterInfo.h"
+#include "AArch64Subtarget.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -84,6 +86,51 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock(
DEBUG(dbgs() << " Ignoring, XZR or WZR already used by the instruction\n");
continue;
}
+ if (MF.getSubtarget<AArch64Subtarget>().hasLSE()) {
+      // XZR/WZR can only be used with the LSE atomics when acquire semantics
+      // are not required; LDOPAL WZ is an invalid opcode.
+ switch (MI.getOpcode()) {
+ case AArch64::CASALb:
+ case AArch64::CASALh:
+ case AArch64::CASALs:
+ case AArch64::CASALd:
+ case AArch64::SWPALb:
+ case AArch64::SWPALh:
+ case AArch64::SWPALs:
+ case AArch64::SWPALd:
+ case AArch64::LDADDALb:
+ case AArch64::LDADDALh:
+ case AArch64::LDADDALs:
+ case AArch64::LDADDALd:
+ case AArch64::LDEORALb:
+ case AArch64::LDEORALh:
+ case AArch64::LDEORALs:
+ case AArch64::LDEORALd:
+ case AArch64::LDSETALb:
+ case AArch64::LDSETALh:
+ case AArch64::LDSETALs:
+ case AArch64::LDSETALd:
+ case AArch64::LDSMINALb:
+ case AArch64::LDSMINALh:
+ case AArch64::LDSMINALs:
+ case AArch64::LDSMINALd:
+ case AArch64::LDSMAXALb:
+ case AArch64::LDSMAXALh:
+ case AArch64::LDSMAXALs:
+ case AArch64::LDSMAXALd:
+ case AArch64::LDUMINALb:
+ case AArch64::LDUMINALh:
+ case AArch64::LDUMINALs:
+ case AArch64::LDUMINALd:
+ case AArch64::LDUMAXALb:
+ case AArch64::LDUMAXALh:
+ case AArch64::LDUMAXALs:
+ case AArch64::LDUMAXALd:
+ continue;
+ default:
+ break;
+ }
+ }
const MCInstrDesc &Desc = MI.getDesc();
for (int I = 0, E = Desc.getNumDefs(); I != E; ++I) {
MachineOperand &MO = MI.getOperand(I);
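
To illustrate what this guard protects (a hedged sketch; `bump` is a hypothetical function): a sequentially consistent RMW whose result is unused has a dead definition, but on an LSE subtarget it may be selected to one of the acquire-variant opcodes listed above, whose destination must not be rewritten to the zero register:

  #include <atomic>

  void bump(std::atomic<int> &v) {
    // The fetched old value is dead, yet with seq_cst ordering this may be
    // selected to LDADDALs on +lse; the switch above now skips replacing the
    // dead destination register with WZR for exactly these opcodes.
    v.fetch_add(1, std::memory_order_seq_cst);
  }
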
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 8c2c0a564c302..04687847c1a30 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -201,7 +201,7 @@ private:
bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width);
- void SelectCMP_SWAP(SDNode *N);
+ bool SelectCMP_SWAP(SDNode *N);
};
} // end anonymous namespace
@@ -2609,9 +2609,13 @@ bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) {
}
/// We've got special pseudo-instructions for these
-void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
+bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
unsigned Opcode;
EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
+
+  // Leave the node untouched if the subtarget supports LSE, so that
+  // instruction selection can match the CASAL patterns instead.
+ if (Subtarget->hasLSE()) return false;
+
if (MemTy == MVT::i8)
Opcode = AArch64::CMP_SWAP_8;
else if (MemTy == MVT::i16)
@@ -2637,6 +2641,8 @@ void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0));
ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2));
CurDAG->RemoveDeadNode(N);
+
+ return true;
}
void AArch64DAGToDAGISel::Select(SDNode *Node) {
@@ -2660,8 +2666,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
case ISD::ATOMIC_CMP_SWAP:
- SelectCMP_SWAP(Node);
- return;
+ if (SelectCMP_SWAP(Node))
+ return;
+ break;
case ISD::READ_REGISTER:
if (tryReadRegister(Node))
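
A hedged source-level example of what now stays on the CAS path (the function name is hypothetical): with LSE, the ATOMIC_CMP_SWAP node is left for normal pattern-based selection rather than lowered through the CMP_SWAP pseudos:

  #include <atomic>

  bool cas64(std::atomic<long long> &v, long long expected, long long desired) {
    // On an LSE subtarget, SelectCMP_SWAP now returns false, so this can be
    // matched by the atomic_cmp_swap_64 pattern (a single CASALd) instead of
    // expanding through the CMP_SWAP_64 pseudo-instruction.
    return v.compare_exchange_strong(expected, desired);
  }
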
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 083ca2156598f..2965106fd2708 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10563,11 +10563,20 @@ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
- return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
+  if (Size > 128) return AtomicExpansionKind::None;
+  // Nand is not supported in LSE.
+  if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC;
+  // Currently leave And and Sub to LL/SC.
+  if ((AI->getOperation() == AtomicRMWInst::And) || (AI->getOperation() == AtomicRMWInst::Sub))
+    return AtomicExpansionKind::LLSC;
+  // Leave 128-bit operations to LL/SC.
+  return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC;
}
bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
AtomicCmpXchgInst *AI) const {
+  // If the subtarget has LSE, leave the cmpxchg intact for codegen.
+ if (Subtarget->hasLSE()) return false;
// At -O0, fast-regalloc cannot cope with the live vregs necessary to
// implement cmpxchg without spilling. If the address being exchanged is also
// on the stack and close enough to the spill slot, this can lead to a
diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td
index 71826bec6b11f..de283b70210fe 100644
--- a/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -405,3 +405,49 @@ def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$scratch),
(ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi,
GPR64:$newLo, GPR64:$newHi), []>,
Sched<[WriteAtomic]>;
+
+// v8.1 Atomic instructions:
+def : Pat<(atomic_load_add_8 GPR64:$Rn, GPR32:$Rs), (LDADDALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_add_16 GPR64:$Rn, GPR32:$Rs), (LDADDALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_add_32 GPR64:$Rn, GPR32:$Rs), (LDADDALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_add_64 GPR64:$Rn, GPR64:$Rs), (LDADDALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_or_8 GPR64:$Rn, GPR32:$Rs), (LDSETALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_or_16 GPR64:$Rn, GPR32:$Rs), (LDSETALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_or_32 GPR64:$Rn, GPR32:$Rs), (LDSETALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_or_64 GPR64:$Rn, GPR64:$Rs), (LDSETALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_xor_8 GPR64:$Rn, GPR32:$Rs), (LDEORALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_xor_16 GPR64:$Rn, GPR32:$Rs), (LDEORALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_xor_32 GPR64:$Rn, GPR32:$Rs), (LDEORALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_xor_64 GPR64:$Rn, GPR64:$Rs), (LDEORALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_max_8 GPR64:$Rn, GPR32:$Rs), (LDSMAXALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_max_16 GPR64:$Rn, GPR32:$Rs), (LDSMAXALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_max_32 GPR64:$Rn, GPR32:$Rs), (LDSMAXALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_max_64 GPR64:$Rn, GPR64:$Rs), (LDSMAXALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_umax_8 GPR64:$Rn, GPR32:$Rs), (LDUMAXALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umax_16 GPR64:$Rn, GPR32:$Rs), (LDUMAXALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umax_32 GPR64:$Rn, GPR32:$Rs), (LDUMAXALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umax_64 GPR64:$Rn, GPR64:$Rs), (LDUMAXALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_min_8 GPR64:$Rn, GPR32:$Rs), (LDSMINALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_min_16 GPR64:$Rn, GPR32:$Rs), (LDSMINALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_min_32 GPR64:$Rn, GPR32:$Rs), (LDSMINALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_min_64 GPR64:$Rn, GPR64:$Rs), (LDSMINALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_umin_8 GPR64:$Rn, GPR32:$Rs), (LDUMINALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umin_16 GPR64:$Rn, GPR32:$Rs), (LDUMINALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umin_32 GPR64:$Rn, GPR32:$Rs), (LDUMINALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umin_64 GPR64:$Rn, GPR64:$Rs), (LDUMINALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_cmp_swap_8 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALb GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>;
+def : Pat<(atomic_cmp_swap_16 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALh GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>;
+def : Pat<(atomic_cmp_swap_32 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALs GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>;
+def : Pat<(atomic_cmp_swap_64 GPR64:$Rn, GPR64:$Rold, GPR64:$Rnew), (CASALd GPR64:$Rold, GPR64:$Rnew, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_swap_8 GPR64:$Rn, GPR32:$Rs), (SWPALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_swap_16 GPR64:$Rn, GPR32:$Rs), (SWPALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_swap_32 GPR64:$Rn, GPR32:$Rs), (SWPALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_swap_64 GPR64:$Rn, GPR64:$Rs), (SWPALd GPR64:$Rs, GPR64sp:$Rn)>;
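
For reference, a hedged C++-level example of an operation these patterns cover (the identifiers are hypothetical):

  #include <atomic>

  unsigned set_flags(std::atomic<unsigned> &flags, unsigned mask) {
    // atomic_load_or_32 with seq_cst ordering corresponds to the LDSETALs
    // pattern above when the subtarget implements the v8.1 LSE atomics.
    return flags.fetch_or(mask, std::memory_order_seq_cst);
  }
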
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index eea012382150c..314e89bbca863 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1036,7 +1036,7 @@ static bool UpdateOperandRegClass(MachineInstr &Instr) {
/// \brief Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible to do the actual
/// substitution and legality checking.
-static unsigned convertFlagSettingOpcode(const MachineInstr &MI) {
+static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
// Don't convert all compare instructions, because for some the zero register
// encoding becomes the sp register.
bool MIDefinesZeroReg = false;
@@ -1145,7 +1145,7 @@ bool AArch64InstrInfo::optimizeCompareInstr(
return true;
}
unsigned Opc = CmpInstr.getOpcode();
- unsigned NewOpc = convertFlagSettingOpcode(CmpInstr);
+ unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
if (NewOpc == Opc)
return false;
const MCInstrDesc &MCID = get(NewOpc);
@@ -3318,7 +3318,7 @@ static bool getMaddPatterns(MachineInstr &Root,
// When NZCV is live bail out.
if (Cmp_NZCV == -1)
return false;
- unsigned NewOpc = convertFlagSettingOpcode(Root);
+ unsigned NewOpc = convertToNonFlagSettingOpc(Root);
// When opcode can't change bail out.
// CHECKME: do we miss any cases for opcode conversion?
if (NewOpc == Opc)
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 59f3405fe439a..58e9ce583d44c 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -119,6 +119,44 @@ public:
}
}
+  /// \brief Return the opcode that sets flags when possible. The caller is
+ /// responsible for ensuring the opc has a flag setting equivalent.
+ static unsigned convertToFlagSettingOpc(unsigned Opc, bool &Is64Bit) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Opcode has no flag setting equivalent!");
+ // 32-bit cases:
+ case AArch64::ADDWri: Is64Bit = false; return AArch64::ADDSWri;
+ case AArch64::ADDWrr: Is64Bit = false; return AArch64::ADDSWrr;
+ case AArch64::ADDWrs: Is64Bit = false; return AArch64::ADDSWrs;
+ case AArch64::ADDWrx: Is64Bit = false; return AArch64::ADDSWrx;
+ case AArch64::ANDWri: Is64Bit = false; return AArch64::ANDSWri;
+ case AArch64::ANDWrr: Is64Bit = false; return AArch64::ANDSWrr;
+ case AArch64::ANDWrs: Is64Bit = false; return AArch64::ANDSWrs;
+ case AArch64::BICWrr: Is64Bit = false; return AArch64::BICSWrr;
+ case AArch64::BICWrs: Is64Bit = false; return AArch64::BICSWrs;
+ case AArch64::SUBWri: Is64Bit = false; return AArch64::SUBSWri;
+ case AArch64::SUBWrr: Is64Bit = false; return AArch64::SUBSWrr;
+ case AArch64::SUBWrs: Is64Bit = false; return AArch64::SUBSWrs;
+ case AArch64::SUBWrx: Is64Bit = false; return AArch64::SUBSWrx;
+ // 64-bit cases:
+ case AArch64::ADDXri: Is64Bit = true; return AArch64::ADDSXri;
+ case AArch64::ADDXrr: Is64Bit = true; return AArch64::ADDSXrr;
+ case AArch64::ADDXrs: Is64Bit = true; return AArch64::ADDSXrs;
+ case AArch64::ADDXrx: Is64Bit = true; return AArch64::ADDSXrx;
+ case AArch64::ANDXri: Is64Bit = true; return AArch64::ANDSXri;
+ case AArch64::ANDXrr: Is64Bit = true; return AArch64::ANDSXrr;
+ case AArch64::ANDXrs: Is64Bit = true; return AArch64::ANDSXrs;
+ case AArch64::BICXrr: Is64Bit = true; return AArch64::BICSXrr;
+ case AArch64::BICXrs: Is64Bit = true; return AArch64::BICSXrs;
+ case AArch64::SUBXri: Is64Bit = true; return AArch64::SUBSXri;
+ case AArch64::SUBXrr: Is64Bit = true; return AArch64::SUBSXrr;
+ case AArch64::SUBXrs: Is64Bit = true; return AArch64::SUBSXrs;
+ case AArch64::SUBXrx: Is64Bit = true; return AArch64::SUBSXrx;
+ }
+ }
+
+
/// Return true if this is a load/store that can be potentially paired/merged.
bool isCandidateToMergeOrPair(MachineInstr &MI) const;
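
A minimal usage sketch of the new helper (not a standalone program; it assumes an in-tree AArch64 backend context, and the wrapper function is hypothetical):

  #include "AArch64InstrInfo.h"

  static unsigned flagSettingTwin() {
    bool Is64Bit;
    // Per the switch above, ADDWrr maps to ADDSWrr and Is64Bit becomes false.
    unsigned NewOpc = llvm::AArch64InstrInfo::convertToFlagSettingOpc(
        llvm::AArch64::ADDWrr, Is64Bit);
    return NewOpc; // == llvm::AArch64::ADDSWrr
  }
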
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 9243eb91cc1ac..005f2d51e4036 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -795,6 +795,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
int LoadSize = getMemScale(*LoadI);
int StoreSize = getMemScale(*StoreI);
unsigned LdRt = getLdStRegOp(*LoadI).getReg();
+ const MachineOperand &StMO = getLdStRegOp(*StoreI);
unsigned StRt = getLdStRegOp(*StoreI).getReg();
bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
@@ -807,7 +808,13 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
// Remove the load, if the destination register of the loads is the same
// register for stored value.
if (StRt == LdRt && LoadSize == 8) {
- StoreI->clearRegisterKills(StRt, TRI);
+ for (MachineInstr &MI : make_range(StoreI->getIterator(),
+ LoadI->getIterator())) {
+ if (MI.killsRegister(StRt, TRI)) {
+ MI.clearRegisterKills(StRt, TRI);
+ break;
+ }
+ }
DEBUG(dbgs() << "Remove load instruction:\n ");
DEBUG(LoadI->print(dbgs()));
DEBUG(dbgs() << "\n");
@@ -819,7 +826,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
.addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
- .addReg(StRt)
+ .add(StMO)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
} else {
// FIXME: Currently we disable this transformation in big-endian targets as
@@ -860,14 +867,14 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
DestReg)
- .addReg(StRt)
+ .add(StMO)
.addImm(AndMaskEncoded);
} else {
BitExtMI =
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
DestReg)
- .addReg(StRt)
+ .add(StMO)
.addImm(Immr)
.addImm(Imms);
}
@@ -876,7 +883,10 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
// Clear kill flags between store and load.
for (MachineInstr &MI : make_range(StoreI->getIterator(),
BitExtMI->getIterator()))
- MI.clearRegisterKills(StRt, TRI);
+ if (MI.killsRegister(StRt, TRI)) {
+ MI.clearRegisterKills(StRt, TRI);
+ break;
+ }
DEBUG(dbgs() << "Promoting load by replacing :\n ");
DEBUG(StoreI->print(dbgs()));
diff --git a/lib/Target/AArch64/AArch64MacroFusion.cpp b/lib/Target/AArch64/AArch64MacroFusion.cpp
index 3b71d529db59b..ccc9d2ad1b482 100644
--- a/lib/Target/AArch64/AArch64MacroFusion.cpp
+++ b/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -7,37 +7,27 @@
//
//===----------------------------------------------------------------------===//
//
-// \file This file contains the AArch64 implementation of the DAG scheduling mutation
-// to pair instructions back to back.
+/// \file This file contains the AArch64 implementation of the DAG scheduling
+/// mutation to pair instructions back to back.
//
//===----------------------------------------------------------------------===//
#include "AArch64MacroFusion.h"
#include "AArch64Subtarget.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/CodeGen/MacroFusion.h"
#include "llvm/Target/TargetInstrInfo.h"
-#define DEBUG_TYPE "misched"
-
-STATISTIC(NumFused, "Number of instr pairs fused");
-
using namespace llvm;
-static cl::opt<bool> EnableMacroFusion("aarch64-misched-fusion", cl::Hidden,
- cl::desc("Enable scheduling for macro fusion."), cl::init(true));
-
namespace {
-/// \brief Verify that the instr pair, FirstMI and SecondMI, should be fused
-/// together. Given an anchor instr, when the other instr is unspecified, then
-/// check if the anchor instr may be part of a fused pair at all.
+/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
+/// together. Given SecondMI, when FirstMI is unspecified, then check if
+/// SecondMI may be part of a fused pair at all.
static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
const TargetSubtargetInfo &TSI,
const MachineInstr *FirstMI,
- const MachineInstr *SecondMI) {
- assert((FirstMI || SecondMI) && "At least one instr must be specified");
-
+ const MachineInstr &SecondMI) {
const AArch64InstrInfo &II = static_cast<const AArch64InstrInfo&>(TII);
const AArch64Subtarget &ST = static_cast<const AArch64Subtarget&>(TSI);
@@ -45,9 +35,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
unsigned FirstOpcode =
FirstMI ? FirstMI->getOpcode()
: static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END);
- unsigned SecondOpcode =
- SecondMI ? SecondMI->getOpcode()
- : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END);
+ unsigned SecondOpcode = SecondMI.getOpcode();
if (ST.hasArithmeticBccFusion())
// Fuse CMN, CMP, TST followed by Bcc.
@@ -128,158 +116,49 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
if (ST.hasFuseAES())
// Fuse AES crypto operations.
- switch(FirstOpcode) {
+ switch(SecondOpcode) {
// AES encode.
- case AArch64::AESErr:
- return SecondOpcode == AArch64::AESMCrr ||
- SecondOpcode == AArch64::INSTRUCTION_LIST_END;
+  case AArch64::AESMCrr:
+ return FirstOpcode == AArch64::AESErr ||
+ FirstOpcode == AArch64::INSTRUCTION_LIST_END;
// AES decode.
- case AArch64::AESDrr:
- return SecondOpcode == AArch64::AESIMCrr ||
- SecondOpcode == AArch64::INSTRUCTION_LIST_END;
+ case AArch64::AESIMCrr:
+ return FirstOpcode == AArch64::AESDrr ||
+ FirstOpcode == AArch64::INSTRUCTION_LIST_END;
}
if (ST.hasFuseLiterals())
// Fuse literal generation operations.
- switch (FirstOpcode) {
+ switch (SecondOpcode) {
// PC relative address.
- case AArch64::ADRP:
- return SecondOpcode == AArch64::ADDXri ||
- SecondOpcode == AArch64::INSTRUCTION_LIST_END;
+ case AArch64::ADDXri:
+ return FirstOpcode == AArch64::ADRP ||
+ FirstOpcode == AArch64::INSTRUCTION_LIST_END;
// 32 bit immediate.
- case AArch64::MOVZWi:
- return (SecondOpcode == AArch64::MOVKWi &&
- SecondMI->getOperand(3).getImm() == 16) ||
- SecondOpcode == AArch64::INSTRUCTION_LIST_END;
- // Lower half of 64 bit immediate.
- case AArch64::MOVZXi:
- return (SecondOpcode == AArch64::MOVKXi &&
- SecondMI->getOperand(3).getImm() == 16) ||
- SecondOpcode == AArch64::INSTRUCTION_LIST_END;
- // Upper half of 64 bit immediate.
+ case AArch64::MOVKWi:
+ return (FirstOpcode == AArch64::MOVZWi &&
+ SecondMI.getOperand(3).getImm() == 16) ||
+ FirstOpcode == AArch64::INSTRUCTION_LIST_END;
+ // Lower and upper half of 64 bit immediate.
case AArch64::MOVKXi:
- return FirstMI->getOperand(3).getImm() == 32 &&
- ((SecondOpcode == AArch64::MOVKXi &&
- SecondMI->getOperand(3).getImm() == 48) ||
- SecondOpcode == AArch64::INSTRUCTION_LIST_END);
+ return FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
+ (FirstOpcode == AArch64::MOVZXi &&
+ SecondMI.getOperand(3).getImm() == 16) ||
+ (FirstOpcode == AArch64::MOVKXi &&
+ FirstMI->getOperand(3).getImm() == 32 &&
+ SecondMI.getOperand(3).getImm() == 48);
}
return false;
}
-/// \brief Implement the fusion of instr pairs in the scheduling DAG,
-/// anchored at the instr in AnchorSU..
-static bool scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit &AnchorSU) {
- const MachineInstr *AnchorMI = AnchorSU.getInstr();
- if (!AnchorMI || AnchorMI->isPseudo() || AnchorMI->isTransient())
- return false;
-
- // If the anchor instr is the ExitSU, then consider its predecessors;
- // otherwise, its successors.
- bool Preds = (&AnchorSU == &DAG->ExitSU);
- SmallVectorImpl<SDep> &AnchorDeps = Preds ? AnchorSU.Preds : AnchorSU.Succs;
-
- const MachineInstr *FirstMI = Preds ? nullptr : AnchorMI;
- const MachineInstr *SecondMI = Preds ? AnchorMI : nullptr;
-
- // Check if the anchor instr may be fused.
- if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(),
- FirstMI, SecondMI))
- return false;
-
- // Explorer for fusion candidates among the dependencies of the anchor instr.
- for (SDep &Dep : AnchorDeps) {
- // Ignore dependencies that don't enforce ordering.
- if (Dep.isWeak())
- continue;
-
- SUnit &DepSU = *Dep.getSUnit();
- // Ignore the ExitSU if the dependents are successors.
- if (!Preds && &DepSU == &DAG->ExitSU)
- continue;
-
- const MachineInstr *DepMI = DepSU.getInstr();
- if (!DepMI || DepMI->isPseudo() || DepMI->isTransient())
- continue;
-
- FirstMI = Preds ? DepMI : AnchorMI;
- SecondMI = Preds ? AnchorMI : DepMI;
- if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(),
- FirstMI, SecondMI))
- continue;
-
- // Create a single weak edge between the adjacent instrs. The only effect is
- // to cause bottom-up scheduling to heavily prioritize the clustered instrs.
- SUnit &FirstSU = Preds ? DepSU : AnchorSU;
- SUnit &SecondSU = Preds ? AnchorSU : DepSU;
- DAG->addEdge(&SecondSU, SDep(&FirstSU, SDep::Cluster));
-
- // Adjust the latency between the anchor instr and its
- // predecessors/successors.
- for (SDep &IDep : AnchorDeps)
- if (IDep.getSUnit() == &DepSU)
- IDep.setLatency(0);
-
- // Adjust the latency between the dependent instr and its
- // successors/predecessors.
- for (SDep &IDep : Preds ? DepSU.Succs : DepSU.Preds)
- if (IDep.getSUnit() == &AnchorSU)
- IDep.setLatency(0);
-
- DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse ";
- FirstSU.print(dbgs(), DAG); dbgs() << " - ";
- SecondSU.print(dbgs(), DAG); dbgs() << " / ";
- dbgs() << DAG->TII->getName(FirstMI->getOpcode()) << " - " <<
- DAG->TII->getName(SecondMI->getOpcode()) << '\n'; );
-
- if (&SecondSU != &DAG->ExitSU)
- // Make instructions dependent on FirstSU also dependent on SecondSU to
- // prevent them from being scheduled between FirstSU and and SecondSU.
- for (SUnit::const_succ_iterator
- SI = FirstSU.Succs.begin(), SE = FirstSU.Succs.end();
- SI != SE; ++SI) {
- if (!SI->getSUnit() || SI->getSUnit() == &SecondSU)
- continue;
- DEBUG(dbgs() << " Copy Succ ";
- SI->getSUnit()->print(dbgs(), DAG); dbgs() << '\n';);
- DAG->addEdge(SI->getSUnit(), SDep(&SecondSU, SDep::Artificial));
- }
-
- ++NumFused;
- return true;
- }
-
- return false;
-}
-
-/// \brief Post-process the DAG to create cluster edges between instrs that may
-/// be fused by the processor into a single operation.
-class AArch64MacroFusion : public ScheduleDAGMutation {
-public:
- AArch64MacroFusion() {}
-
- void apply(ScheduleDAGInstrs *DAGInstrs) override;
-};
-
-void AArch64MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
- ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
-
- // For each of the SUnits in the scheduling block, try to fuse the instr in it
- // with one in its successors.
- for (SUnit &ISU : DAG->SUnits)
- scheduleAdjacentImpl(DAG, ISU);
-
- // Try to fuse the instr in the ExitSU with one in its predecessors.
- scheduleAdjacentImpl(DAG, DAG->ExitSU);
-}
-
} // end namespace
namespace llvm {
std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation () {
- return EnableMacroFusion ? make_unique<AArch64MacroFusion>() : nullptr;
+ return createMacroFusionDAGMutation(shouldScheduleAdjacent);
}
} // end namespace llvm
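
A hedged sketch of how the returned mutation is typically wired into the machine scheduler, mirroring the createMachineScheduler hook in the AArch64 target machine (the surrounding pass-config class and the function name are elided/hypothetical here):

  #include "AArch64MacroFusion.h"
  #include "llvm/CodeGen/MachineScheduler.h"
  using namespace llvm;

  // Attach the fusion mutation so the scheduler clusters fusable pairs
  // back to back.
  static ScheduleDAGInstrs *createAArch64Scheduler(MachineSchedContext *C) {
    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
    DAG->addMutation(createAArch64MacroFusionDAGMutation());
    return DAG;
  }
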
diff --git a/lib/Target/AArch64/AArch64MacroFusion.h b/lib/Target/AArch64/AArch64MacroFusion.h
index e5efedd9fbfd9..32d90d4c40d6f 100644
--- a/lib/Target/AArch64/AArch64MacroFusion.h
+++ b/lib/Target/AArch64/AArch64MacroFusion.h
@@ -2,23 +2,18 @@
//
// The LLVM Compiler Infrastructure
//
-// \fileThis file is distributed under the University of Illinois Open Source
+// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
-// This file contains the AArch64 definition of the DAG scheduling mutation
-// to pair instructions back to back.
+/// \file This file contains the AArch64 definition of the DAG scheduling
+/// mutation to pair instructions back to back.
//
//===----------------------------------------------------------------------===//
-#include "AArch64InstrInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
-//===----------------------------------------------------------------------===//
-// AArch64MacroFusion - DAG post-processing to encourage fusion of macro ops.
-//===----------------------------------------------------------------------===//
-
namespace llvm {
/// Note that you have to add:
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
index 9b3899e0681cf..69124dbd0f838 100644
--- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -469,10 +469,6 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
getCopyMapping(DstRB.getID(), SrcRB.getID(), Size),
/*NumOperands*/ 2);
}
- case TargetOpcode::G_SEQUENCE:
- // FIXME: support this, but the generic code is really not going to do
- // anything sane.
- return getInvalidInstructionMapping();
default:
break;
}
diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td
index 303398ea0b7f3..5d1608ef04afa 100644
--- a/lib/Target/AArch64/AArch64SchedA57.td
+++ b/lib/Target/AArch64/AArch64SchedA57.td
@@ -13,7 +13,7 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// The Cortex-A57 is a traditional superscaler microprocessor with a
+// The Cortex-A57 is a traditional superscalar microprocessor with a
// conservative 3-wide in-order stage for decode and dispatch. Combined with the
// much wider out-of-order issue stage, this produced a need to carefully
// schedule micro-ops so that all three decoded each cycle are successfully
diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
index 3d737402022d8..0aeb1f3e30584 100644
--- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td
+++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
@@ -32,8 +32,8 @@
//===----------------------------------------------------------------------===//
// Define 0 micro-op types
-def FalkorWr_none_1cyc : SchedWriteRes<[]> {
- let Latency = 1;
+def FalkorWr_LdStInc_none_3cyc : SchedWriteRes<[]> {
+ let Latency = 3;
let NumMicroOps = 0;
}
def FalkorWr_none_3cyc : SchedWriteRes<[]> {
@@ -505,7 +505,8 @@ def FalkorWr_4VXVY_4ST_4VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST,
let NumMicroOps = 12;
}
-// Forwarding logic is modeled for multiply add/accumulate.
+// Forwarding logic is modeled for multiply add/accumulate and
+// load/store base register increment.
// -----------------------------------------------------------------------------
def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>;
def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>;
@@ -513,9 +514,13 @@ def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr
def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>;
def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>;
+def FalkorReadIncLd : SchedReadAdvance<2, [FalkorWr_LdStInc_none_3cyc]>;
+def FalkorReadIncSt : SchedReadAdvance<1, [FalkorWr_LdStInc_none_3cyc]>;
+
// SchedPredicates and WriteVariants for Immediate Zero and LSLFast/ASRFast
// -----------------------------------------------------------------------------
-def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>;
+def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).isImm() &&
+ MI->getOperand(1).getImm() == 0}]>;
def FalkorOp1ZrReg : SchedPredicate<[{MI->getOperand(1).getReg() == AArch64::WZR ||
MI->getOperand(1).getReg() == AArch64::XZR}]>;
@@ -770,84 +775,113 @@ def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA],
// SIMD Load Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>;
-def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[FalkorWr_1LD_3cyc], (instrs LD2i64)>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instrs LD2i64_POST)>;
-
-def : InstRW<[FalkorWr_1LD_1VXVY_4cyc], (instregex "^LD1i(8|16|32)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1VXVY_4cyc], (instregex "^LD1i(8|16|32)_POST$")>;
-
-def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>;
-
-def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD3i64)>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instrs LD3i64_POST)>;
-def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD4i64)>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instrs LD4i64_POST)>;
-
-def : InstRW<[FalkorWr_1LD_2VXVY_4cyc], (instregex "^LD2i(8|16|32)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_2VXVY_4cyc], (instregex "^LD2i(8|16|32)_POST$")>;
-
-def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>;
-
-def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[FalkorWr_3LD_3cyc], (instrs LD3Threev2d)>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instrs LD3Threev2d_POST)>;
-def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>;
-
-def : InstRW<[FalkorWr_1LD_3VXVY_4cyc], (instregex "^LD3i(8|16|32)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3VXVY_4cyc], (instregex "^LD3i(8|16|32)_POST$")>;
-
-def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>;
-
-def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[FalkorWr_4LD_3cyc], (instrs LD4Fourv2d)>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instrs LD4Fourv2d_POST)>;
-def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>;
-
-def : InstRW<[FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)_POST$")>;
-
-def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc], (instregex "^LD3Threev(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_1none_4cyc],
- (instregex "^LD3Threev(8b|4h|2s|1d)_POST$")>;
-
-def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc], (instregex "^LD4Fourv(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_2none_4cyc],
- (instregex "^LD4Fourv(8b|4h|2s|1d)_POST$")>;
-
-def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD3Threev(16b|8h|4s)$")>;
-
-def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD4Fourv(16b|8h|4s)$")>;
-
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc],
- (instregex "^LD3Threev(16b|8h|4s)_POST$")>;
-
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc],
- (instregex "^LD4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instrs LD2i64)>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instrs LD2i64_POST)>;
+
+def : InstRW<[FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd], (instregex "^LD1i(8|16|32)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD1i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
+ (instregex "^LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD2Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
+ (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instrs LD3i64)>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+ (instrs LD3i64_POST)>;
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instrs LD4i64)>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+ (instrs LD4i64_POST)>;
+
+def : InstRW<[FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd], (instregex "^LD2i(8|16|32)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD2i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD3Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd],
+ (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instrs LD3Threev2d)>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
+ (instrs LD3Threev2d_POST)>;
+def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instregex "^LD3Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd], (instregex "^LD3i(8|16|32)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD3i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], (instregex "^LD4Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd],
+ (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instrs LD4Fourv2d)>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
+ (instrs LD4Fourv2d_POST)>;
+def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instregex "^LD4Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd], (instregex "^LD4i(8|16|32)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD4i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd],
+ (instregex "^LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd],
+ (instregex "^LD3Threev(8b|4h|2s)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd],
+ (instregex "^LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd],
+ (instregex "^LD4Fourv(8b|4h|2s)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD3Threev(16b|8h|4s)$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD4Fourv(16b|8h|4s)$")>;
+
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD3Threev(16b|8h|4s)_POST$")>;
+
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD4Fourv(16b|8h|4s)_POST$")>;
// Arithmetic and Logical Instructions
// -----------------------------------------------------------------------------
@@ -929,87 +963,105 @@ def : InstRW<[FalkorWr_5VXVY_7cyc], (instregex "^TBX(v8i8Four|v16i8Four)$")>;
// SIMD Store Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STR(Q|D|S|H|B)ui$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc],
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STR(Q|D|S|H|B)ui$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^STR(Q|D|S|H|B)(post|pre)$")>;
-def : InstRW<[FalkorWr_STRVro], (instregex "^STR(D|S|H|B)ro(W|X)$")>;
-def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^STPQi$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2VSD_2ST_0cyc],
+def : InstRW<[FalkorWr_STRVro, ReadDefault, FalkorReadIncSt],
+ (instregex "^STR(D|S|H|B)ro(W|X)$")>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STPQi$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
(instregex "^STPQ(post|pre)$")>;
-def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STP(D|S)(i)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc],
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STP(D|S)(i)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
(instregex "^STP(D|S)(post|pre)$")>;
-def : InstRW<[FalkorWr_STRQro], (instregex "^STRQro(W|X)$")>;
-def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STUR(Q|D|S|B|H)i$")>;
-def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instrs STNPDi, STNPSi)>;
-def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instrs STNPQi)>;
-
-def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64)|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc],
+def : InstRW<[FalkorWr_STRQro, ReadDefault, FalkorReadIncSt],
+ (instregex "^STRQro(W|X)$")>;
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STUR(Q|D|S|B|H)i$")>;
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instrs STNPDi, STNPSi)>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instrs STNPQi)>;
+
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64)|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^ST1(One(v8b|v4h|v2s|v1d)_POST|(i8|i16|i32|i64)_POST)$")>;
-def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))$")>;
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc],
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST2(Two(v8b|v4h|v2s)|(i8|i16|i32|i64))$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>;
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc],
- (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>;
-
-def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>;
-def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>;
-def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST3(i8|i16|i32|i64)$")>;
-def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST4(i8|i16|i32|i64)$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST2(Two(v8b|v4h|v2s)|(i8|i16|i32|i64))_POST$")>;
+
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST3(i8|i16|i32|i64)$")>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST4(i8|i16|i32|i64)$")>;
// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc],
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>;
// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc],
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^ST2Two(v16b|v8h|v4s|v2d)_POST$")>;
// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc],
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^ST3(i8|i16|i32|i64)_POST$")>;
// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc],
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^ST4(i8|i16|i32|i64)_POST$")>;
-def : InstRW<[FalkorWr_1VXVY_2ST_2VSD_0cyc],
- (instregex "^ST3Three(v8b|v4h|v2s|v1d)$")>;
+def : InstRW<[FalkorWr_1VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST3Three(v8b|v4h|v2s)$")>;
// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VXVY_2ST_2VSD_0cyc],
- (instregex "^ST3Three(v8b|v4h|v2s|v1d)_POST$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST3Three(v8b|v4h|v2s)_POST$")>;
-def : InstRW<[FalkorWr_3VSD_3ST_0cyc], (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>;
-def : InstRW<[FalkorWr_3VSD_3ST_0cyc], (instrs ST3Threev2d)>;
+def : InstRW<[FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>;
+def : InstRW<[FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instrs ST3Threev2d)>;
// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc],
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^ST1Three(v16b|v8h|v4s|v2d)_POST$")>;
// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc],
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt],
(instrs ST3Threev2d_POST)>;
-def : InstRW<[FalkorWr_2VXVY_2ST_2VSD_0cyc],
- (instregex "^ST4Four(v8b|v4h|v2s|v1d)$")>;
+def : InstRW<[FalkorWr_2VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST4Four(v8b|v4h|v2s)$")>;
// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_2ST_2VSD_0cyc],
- (instregex "^ST4Four(v8b|v4h|v2s|v1d)_POST$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST4Four(v8b|v4h|v2s)_POST$")>;
-def : InstRW<[FalkorWr_4VSD_4ST_0cyc], (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>;
-def : InstRW<[FalkorWr_4VSD_4ST_0cyc], (instrs ST4Fourv2d)>;
+def : InstRW<[FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>;
+def : InstRW<[FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instrs ST4Fourv2d)>;
// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc],
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^ST1Four(v16b|v8h|v4s|v2d)_POST$")>;
// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc],
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt],
(instrs ST4Fourv2d_POST)>;
-def : InstRW<[FalkorWr_2VXVY_4ST_4VSD_0cyc],
+def : InstRW<[FalkorWr_2VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^ST3Three(v16b|v8h|v4s)$")>;
// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_4ST_4VSD_0cyc],
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^ST3Three(v16b|v8h|v4s)_POST$")>;
-def : InstRW<[FalkorWr_4VXVY_4ST_4VSD_0cyc],
+def : InstRW<[FalkorWr_4VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^ST4Four(v16b|v8h|v4s)$")>;
// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VXVY_4ST_4VSD_0cyc],
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^ST4Four(v16b|v8h|v4s)_POST$")>;
// Branch Instructions
@@ -1033,22 +1085,25 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instrs SHA256SU1rrr)>;
// FP Load Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc],
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
(instregex "^LDR(Q|D|S|H|B)(post|pre)$")>;
-def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(Q|D|S|H|B)i$")>;
-def : InstRW<[FalkorWr_LDRro], (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>;
-def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc],
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDUR(Q|D|S|H|B)i$")>;
+def : InstRW<[FalkorWr_LDRro, FalkorReadIncLd],
+ (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instrs LDNPQi)>;
-def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc],
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instrs LDPQi)>;
-def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc],
+def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "LDNP(D|S)i$")>;
-def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc],
+def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "LDP(D|S)i$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc],
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "LDP(D|S)(pre|post)$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc, FalkorWr_none_3cyc],
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "^LDPQ(pre|post)$")>;
// FP Data Processing Instructions
@@ -1106,31 +1161,41 @@ def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v2f64|v
// -----------------------------------------------------------------------------
def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFMui, PRFMl)>;
def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFUMi)>;
-def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc],
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "^LDNP(W|X)i$")>;
-def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc],
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "^LDP(W|X)i$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc, FalkorWr_none_3cyc],
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "^LDP(W|X)(post|pre)$")>;
-def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(BB|HH|W|X)ui$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc],
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDR(BB|HH|W|X)ui$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
(instregex "^LDR(BB|HH|W|X)(post|pre)$")>;
-def : InstRW<[FalkorWr_LDRro], (instregex "^LDR(BB|HH|W|X)ro(W|X)$")>;
-def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(W|X)l$")>;
-def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDTR(B|H|W|X)i$")>;
-def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(BB|HH|W|X)i$")>;
+def : InstRW<[FalkorWr_LDRro, FalkorReadIncLd],
+ (instregex "^LDR(BB|HH|W|X)ro(W|X)$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDR(W|X)l$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDTR(B|H|W|X)i$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDUR(BB|HH|W|X)i$")>;
def : InstRW<[FalkorWr_PRFMro], (instregex "^PRFMro(W|X)$")>;
-def : InstRW<[FalkorWr_1LD_4cyc, FalkorWr_none_4cyc],
+def : InstRW<[FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd],
(instrs LDPSWi)>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4cyc, FalkorWr_none_4cyc],
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd],
(instregex "^LDPSW(post|pre)$")>;
-def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4cyc],
+def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd],
+ (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4cyc, FalkorReadIncLd],
(instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>;
-def : InstRW<[FalkorWr_LDRSro], (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>;
-def : InstRW<[FalkorWr_1LD_4cyc], (instrs LDRSWl)>;
-def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>;
-def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDURS(BW|BX|HW|HX|W)i$")>;
+def : InstRW<[FalkorWr_LDRSro, FalkorReadIncLd],
+ (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>;
+def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd],
+ (instrs LDRSWl)>;
+def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd],
+ (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>;
+def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd],
+ (instregex "^LDURS(BW|BX|HW|HX|W)i$")>;
// Miscellaneous Data-Processing Instructions
// -----------------------------------------------------------------------------
@@ -1178,32 +1243,46 @@ def : InstRW<[FalkorWr_1none_0cyc], (instrs BRK, DCPS1, DCPS2, DCPS3, HINT, HL
def : InstRW<[FalkorWr_1ST_0cyc], (instrs SYSxt, SYSLxt)>;
def : InstRW<[FalkorWr_1Z_0cyc], (instrs MSRpstateImm1, MSRpstateImm4)>;
-def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^(LDAR(B|H|W|X)|LDAXP(W|X)|LDAXR(B|H|W|X)|LDXP(W|X)|LDXR(B|H|W|X))$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^(LDAR(B|H|W|X)|LDAXR(B|H|W|X)|LDXR(B|H|W|X))$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instregex "^(LDAXP(W|X)|LDXP(W|X))$")>;
def : InstRW<[FalkorWr_1LD_3cyc], (instrs MRS, MOVbaseTLS)>;
def : InstRW<[FalkorWr_1LD_1Z_3cyc], (instrs DRPS)>;
def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs MSR)>;
-def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs STNPWi, STNPXi)>;
+def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instrs STNPWi, STNPXi)>;
def : InstRW<[FalkorWr_2LD_1Z_3cyc], (instrs ERET)>;
def : InstRW<[FalkorWr_1ST_1SD_1LD_3cyc], (instregex "^LDC.*$")>;
-def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STLR(B|H|W|X)$")>;
-def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STXP(W|X)$")>;
-def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STXR(B|H|W|X)$")>;
-
-def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXP(W|X)$")>;
-def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXR(B|H|W|X)$")>;
+def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STLR(B|H|W|X)$")>;
+def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STXP(W|X)$")>;
+def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STXR(B|H|W|X)$")>;
+
+def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc, ReadDefault, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STLXP(W|X)$")>;
+def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STLXR(B|H|W|X)$")>;
// Store Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STP(W|X)i$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1SD_1ST_0cyc],
+def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STP(W|X)i$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
(instregex "^STP(W|X)(post|pre)$")>;
-def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STR(BB|HH|W|X)ui$")>;
-def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1SD_1ST_0cyc],
+def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STR(BB|HH|W|X)ui$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^STR(BB|HH|W|X)(post|pre)$")>;
-def : InstRW<[FalkorWr_STRro], (instregex "^STR(BB|HH|W|X)ro(W|X)$")>;
-def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STTR(B|H|W|X)i$")>;
-def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STUR(BB|HH|W|X)i$")>;
+def : InstRW<[FalkorWr_STRro, ReadDefault, FalkorReadIncSt],
+ (instregex "^STR(BB|HH|W|X)ro(W|X)$")>;
+def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STTR(B|H|W|X)i$")>;
+def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STUR(BB|HH|W|X)i$")>;
diff --git a/lib/Target/AArch64/AArch64SchedKryoDetails.td b/lib/Target/AArch64/AArch64SchedKryoDetails.td
index 02cccccd3078c..cf4cdabb8cbfc 100644
--- a/lib/Target/AArch64/AArch64SchedKryoDetails.td
+++ b/lib/Target/AArch64/AArch64SchedKryoDetails.td
@@ -1374,7 +1374,9 @@ def KryoWrite_3cyc_LS_LS_400ln :
let Latency = 3; let NumMicroOps = 2;
}
def : InstRW<[KryoWrite_3cyc_LS_LS_400ln],
- (instregex "(LDAX?R(B|H|W|X)|LDAXP(W|X))")>;
+ (instregex "LDAX?R(B|H|W|X)")>;
+def : InstRW<[KryoWrite_3cyc_LS_LS_400ln, WriteLDHi],
+ (instregex "LDAXP(W|X)")>;
def KryoWrite_3cyc_LS_LS_401ln :
SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
let Latency = 3; let NumMicroOps = 2;
@@ -1565,7 +1567,7 @@ def KryoWrite_3cyc_LS_258ln :
SchedWriteRes<[KryoUnitLS]> {
let Latency = 3; let NumMicroOps = 1;
}
-def : InstRW<[KryoWrite_3cyc_LS_258ln],
+def : InstRW<[KryoWrite_3cyc_LS_258ln, WriteLDHi],
(instregex "LDXP(W|X)")>;
def KryoWrite_3cyc_LS_258_1ln :
SchedWriteRes<[KryoUnitLS]> {
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index d4a8cecdb29f1..6660f0babb8a6 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -47,6 +47,11 @@ static cl::opt<bool> EnableCCMP("aarch64-enable-ccmp",
cl::desc("Enable the CCMP formation pass"),
cl::init(true), cl::Hidden);
+static cl::opt<bool>
+ EnableCondBrTuning("aarch64-enable-cond-br-tune",
+ cl::desc("Enable the conditional branch tuning pass"),
+ cl::init(true), cl::Hidden);
+
static cl::opt<bool> EnableMCR("aarch64-enable-mcr",
cl::desc("Enable the machine combiner pass"),
cl::init(true), cl::Hidden);
@@ -429,6 +434,8 @@ bool AArch64PassConfig::addILPOpts() {
addPass(createAArch64ConditionalCompares());
if (EnableMCR)
addPass(&MachineCombinerID);
+ if (EnableCondBrTuning)
+ addPass(createAArch64CondBrTuning());
if (EnableEarlyIfConversion)
addPass(&EarlyIfConverterID);
if (EnableStPairSuppress)
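Note: the new pass is gated by a hidden command-line flag following the same cl::opt pattern as the surrounding options. A minimal standalone sketch of that pattern (the option name "enable-example-pass" and the helper below are hypothetical, for illustration only):

    #include "llvm/Support/CommandLine.h"
    using namespace llvm;

    // Hidden boolean flag, on by default, mirroring
    // -aarch64-enable-cond-br-tune above.
    static cl::opt<bool>
        EnableExamplePass("enable-example-pass",
                          cl::desc("Enable the example pass"),
                          cl::init(true), cl::Hidden);

    // A pass config would consult the flag before adding the pass.
    bool shouldAddExamplePass() { return EnableExamplePass; }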
diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt
index f0f50f29be0f3..02b12b5e90ca2 100644
--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt
@@ -43,6 +43,7 @@ add_llvm_target(AArch64CodeGen
AArch64AsmPrinter.cpp
AArch64CleanupLocalDynamicTLSPass.cpp
AArch64CollectLOH.cpp
+ AArch64CondBrTuning.cpp
AArch64ConditionalCompares.cpp
AArch64DeadRegisterDefinitionsPass.cpp
AArch64ExpandPseudoInsts.cpp
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 43a6fa9ce0896..3d075018904c0 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -43,26 +43,25 @@ public:
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = {
- // This table *must* be in the order that the fixup_* kinds are defined in
- // AArch64FixupKinds.h.
- //
- // Name Offset (bits) Size (bits) Flags
- { "fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal },
- { "fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal },
- { "fixup_aarch64_add_imm12", 10, 12, 0 },
- { "fixup_aarch64_ldst_imm12_scale1", 10, 12, 0 },
- { "fixup_aarch64_ldst_imm12_scale2", 10, 12, 0 },
- { "fixup_aarch64_ldst_imm12_scale4", 10, 12, 0 },
- { "fixup_aarch64_ldst_imm12_scale8", 10, 12, 0 },
- { "fixup_aarch64_ldst_imm12_scale16", 10, 12, 0 },
- { "fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal },
- { "fixup_aarch64_movw", 5, 16, 0 },
- { "fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal },
- { "fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal },
- { "fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal },
- { "fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal },
- { "fixup_aarch64_tlsdesc_call", 0, 0, 0 }
- };
+ // This table *must* be in the order that the fixup_* kinds are defined
+ // in AArch64FixupKinds.h.
+ //
+ // Name Offset (bits) Size (bits) Flags
+ {"fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal},
+ {"fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal},
+ {"fixup_aarch64_add_imm12", 10, 12, 0},
+ {"fixup_aarch64_ldst_imm12_scale1", 10, 12, 0},
+ {"fixup_aarch64_ldst_imm12_scale2", 10, 12, 0},
+ {"fixup_aarch64_ldst_imm12_scale4", 10, 12, 0},
+ {"fixup_aarch64_ldst_imm12_scale8", 10, 12, 0},
+ {"fixup_aarch64_ldst_imm12_scale16", 10, 12, 0},
+ {"fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal},
+ {"fixup_aarch64_movw", 5, 16, 0},
+ {"fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal},
+ {"fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal},
+ {"fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal},
+ {"fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal},
+ {"fixup_aarch64_tlsdesc_call", 0, 0, 0}};
if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
@@ -72,8 +71,9 @@ public:
return Infos[Kind - FirstTargetFixupKind];
}
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const override;
bool mayNeedRelaxation(const MCInst &Inst) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
@@ -261,13 +261,15 @@ unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) con
}
}
-void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel, MCContext &Ctx) const {
+void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool IsPCRel) const {
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
if (!Value)
return; // Doesn't change encoding.
MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
+ MCContext &Ctx = Asm.getContext();
// Apply any target-specific value adjustments.
Value = adjustFixupValue(Fixup, Value, Ctx);
@@ -275,7 +277,7 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
Value <<= Info.TargetOffset;
unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+ assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
// Used to point to big endian bytes.
unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind());
@@ -289,7 +291,7 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
}
} else {
// Handle as big-endian
- assert((Offset + FulleSizeInBytes) <= DataSize && "Invalid fixup size!");
+ assert((Offset + FulleSizeInBytes) <= Data.size() && "Invalid fixup size!");
assert(NumBytes <= FulleSizeInBytes && "Invalid fixup size!");
for (unsigned i = 0; i != NumBytes; ++i) {
unsigned Idx = FulleSizeInBytes - 1 - i;
@@ -539,16 +541,14 @@ public:
return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian, IsILP32);
}
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override;
+ void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, bool &IsResolved) override;
};
-void ELFAArch64AsmBackend::processFixupValue(
- const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup,
- const MCFragment *DF, const MCValue &Target, uint64_t &Value,
- bool &IsResolved) {
+void ELFAArch64AsmBackend::processFixupValue(const MCAssembler &Asm,
+ const MCFixup &Fixup,
+ const MCValue &Target,
+ bool &IsResolved) {
// The ADRP instruction adds some multiple of 0x1000 to the current PC &
// ~0xfff. This means that the required offset to reach a symbol can vary by
// up to one step depending on where the ADRP is in memory. For example:
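Note: the applyFixup change above replaces the raw "char *Data, unsigned DataSize" pair with MutableArrayRef<char>, which carries its own length and keeps the bounds assertion next to the data it guards. A minimal sketch of the little-endian patching idiom under that interface (the function name patchLE is hypothetical):

    #include "llvm/ADT/ArrayRef.h"
    #include <cassert>
    #include <cstdint>

    // OR a little-endian NumBytes-wide value into Data at Offset,
    // as applyFixup does for the little-endian case.
    static void patchLE(llvm::MutableArrayRef<char> Data, unsigned Offset,
                        uint64_t Value, unsigned NumBytes) {
      assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
      for (unsigned i = 0; i != NumBytes; ++i)
        Data[Offset + i] |= uint8_t(Value >> (i * 8));
    }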
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 6ab2b9ef04598..7494e5decd6f6 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -238,6 +238,36 @@ def FeatureSDWA : SubtargetFeature<"sdwa",
"Support SDWA (Sub-DWORD Addressing) extension"
>;
+def FeatureSDWAOmod : SubtargetFeature<"sdwa-omod",
+ "HasSDWAOmod",
+ "true",
+ "Support OMod with SDWA (Sub-DWORD Addressing) extension"
+>;
+
+def FeatureSDWAScalar : SubtargetFeature<"sdwa-scalar",
+ "HasSDWAScalar",
+ "true",
+ "Support scalar register with SDWA (Sub-DWORD Addressing) extension"
+>;
+
+def FeatureSDWASdst : SubtargetFeature<"sdwa-sdst",
+ "HasSDWASdst",
+ "true",
+ "Support scalar dst for VOPC with SDWA (Sub-DWORD Addressing) extension"
+>;
+
+def FeatureSDWAMac : SubtargetFeature<"sdwa-mac",
+ "HasSDWAMac",
+ "true",
+ "Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension"
+>;
+
+def FeatureSDWAClampVOPC : SubtargetFeature<"sdwa-clamp-vopc",
+ "HasSDWAClampVOPC",
+ "true",
+ "Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension"
+>;
+
def FeatureDPP : SubtargetFeature<"dpp",
"HasDPP",
"true",
@@ -421,8 +451,8 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
- FeatureScalarStores, FeatureInv2PiInlineImm, FeatureSDWA,
- FeatureDPP
+ FeatureScalarStores, FeatureInv2PiInlineImm,
+ FeatureSDWA, FeatureSDWAClampVOPC, FeatureSDWAMac, FeatureDPP
]
>;
@@ -432,7 +462,8 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,
- FeatureFastFMAF32, FeatureSDWA, FeatureDPP,
+ FeatureFastFMAF32, FeatureDPP,
+ FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts
]
>;
@@ -449,14 +480,14 @@ class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping,
def FeatureISAVersion6_0_0 : SubtargetFeatureISAVersion <6,0,0,
[FeatureSouthernIslands,
- FeatureFastFMAF32,
+ FeatureFastFMAF32,
HalfRate64Ops,
FeatureLDSBankCount32]>;
def FeatureISAVersion6_0_1 : SubtargetFeatureISAVersion <6,0,1,
[FeatureSouthernIslands,
FeatureLDSBankCount32]>;
-
+
def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0,
[FeatureSeaIslands,
FeatureLDSBankCount32]>;
@@ -644,7 +675,11 @@ def isCIVI : Predicate <
"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,
AssemblerPredicate<"FeatureCIInsts">;
-def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">;
+def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
+ AssemblerPredicate<"FeatureFlatAddressSpace">;
+
+def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">,
+ AssemblerPredicate<"FeatureFlatGlobalInsts">;
def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
AssemblerPredicate<"Feature16BitInsts">;
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 5586b513b5fca..96f819fd0e684 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3527,18 +3527,25 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
//===----------------------------------------------------------------------===//
SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
- const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const {
+ const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT,
+ const SDLoc &SL,
+ bool RawReg) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned VirtualRegister;
+ unsigned VReg;
+
if (!MRI.isLiveIn(Reg)) {
- VirtualRegister = MRI.createVirtualRegister(RC);
- MRI.addLiveIn(Reg, VirtualRegister);
+ VReg = MRI.createVirtualRegister(RC);
+ MRI.addLiveIn(Reg, VReg);
} else {
- VirtualRegister = MRI.getLiveInVirtReg(Reg);
+ VReg = MRI.getLiveInVirtReg(Reg);
}
- return DAG.getRegister(VirtualRegister, VT);
+
+ if (RawReg)
+ return DAG.getRegister(VReg, VT);
+
+ return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
}
uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
@@ -3657,6 +3664,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(STORE_MSKOR)
NODE_NAME_CASE(LOAD_CONSTANT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
+ NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
+ NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
NODE_NAME_CASE(ATOMIC_INC)
NODE_NAME_CASE(ATOMIC_DEC)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 0d066cdbdff4d..a45234e2b39f2 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -216,10 +216,25 @@ public:
/// \brief Helper function that adds Reg to the LiveIn list of the DAG's
/// MachineFunction.
///
- /// \returns a RegisterSDNode representing Reg.
- virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
- const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const;
+ /// \returns a RegisterSDNode representing Reg if \p RawReg is true, otherwise
+ /// a copy from the register.
+ SDValue CreateLiveInRegister(SelectionDAG &DAG,
+ const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT,
+ const SDLoc &SL,
+ bool RawReg = false) const;
+ SDValue CreateLiveInRegister(SelectionDAG &DAG,
+ const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT) const {
+ return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()));
+ }
+
+  // Returns the raw live-in register rather than a copy from it.
+ SDValue CreateLiveInRegisterRaw(SelectionDAG &DAG,
+ const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT) const {
+ return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true);
+ }
enum ImplicitParameter {
FIRST_IMPLICIT,
@@ -388,6 +403,8 @@ enum NodeType : unsigned {
STORE_MSKOR,
LOAD_CONSTANT,
TBUFFER_STORE_FORMAT,
+ TBUFFER_STORE_FORMAT_X3,
+ TBUFFER_LOAD_FORMAT,
ATOMIC_CMP_SWAP,
ATOMIC_INC,
ATOMIC_DEC,
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index a01f5d37c7c16..69dc529861729 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -66,7 +66,9 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
enum SIEncodingFamily {
SI = 0,
- VI = 1
+ VI = 1,
+ SDWA = 2,
+ SDWA9 = 3
};
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
@@ -101,7 +103,12 @@ static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) {
}
int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
- int MCOp = AMDGPU::getMCOpcode(Opcode, subtargetEncodingFamily(ST));
+ SIEncodingFamily Gen = subtargetEncodingFamily(ST);
+ if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
+ Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
+ : SIEncodingFamily::SDWA;
+
+ int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
// -1 means that Opcode is already a native instruction.
if (MCOp == -1)
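Note: put differently, the pseudo-to-MC opcode table now has four encoding columns instead of two, and SDWA pseudos pick the SDWA column matching the subtarget generation. A standalone restatement of that selection (hypothetical helper, mirroring the logic above):

    // Sketch of the SIEncodingFamily choice in pseudoToMCOpcode.
    enum class Family { SI = 0, VI = 1, SDWA = 2, SDWA9 = 3 };

    static Family pickEncodingFamily(Family Default, bool IsSDWAInst,
                                     bool IsGFX9) {
      if (!IsSDWAInst)
        return Default;          // non-SDWA: keep the subtarget's family
      return IsGFX9 ? Family::SDWA9 : Family::SDWA;
    }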
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index e286558ce60d7..bcf89bb78ad66 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -70,6 +70,10 @@ def AMDGPUElseBreakOp : SDTypeProfile<1, 2,
[SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, i64>]
>;
+def AMDGPUAddeSubeOp : SDTypeProfile<2, 3,
+ [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>]
+>;
+
//===----------------------------------------------------------------------===//
// AMDGPU DAG Nodes
//
@@ -179,6 +183,12 @@ def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>;
// out = (src1 > src0) ? 1 : 0
def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>;
+// TODO: remove AMDGPUadde/AMDGPUsube when ADDCARRY/SUBCARRY get their own
+// nodes in TargetSelectionDAG.td.
+def AMDGPUadde : SDNode<"ISD::ADDCARRY", AMDGPUAddeSubeOp, []>;
+
+def AMDGPUsube : SDNode<"ISD::SUBCARRY", AMDGPUAddeSubeOp, []>;
+
def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc
SDTCisVT<0, i64>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
]>;
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 790a69b843979..cc56216c355bf 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -29,12 +29,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
using namespace TargetOpcode;
const LLT S1= LLT::scalar(1);
+ const LLT V2S16 = LLT::vector(2, 16);
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
const LLT P1 = LLT::pointer(1, 64);
const LLT P2 = LLT::pointer(2, 64);
setAction({G_ADD, S32}, Legal);
+ setAction({G_AND, S32}, Legal);
+
+ setAction({G_BITCAST, V2S16}, Legal);
+ setAction({G_BITCAST, 1, S32}, Legal);
+
+ setAction({G_BITCAST, S32}, Legal);
+ setAction({G_BITCAST, 1, V2S16}, Legal);
// FIXME: i1 operands to intrinsics should always be legal, but other i1
// values may not be legal. We need to figure out how to distinguish
@@ -61,6 +69,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
setAction({G_SELECT, S32}, Legal);
setAction({G_SELECT, 1, S1}, Legal);
+ setAction({G_SHL, S32}, Legal);
+
setAction({G_STORE, S32}, Legal);
setAction({G_STORE, 1, P1}, Legal);
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 8d157e2f98f24..ab5abf2039a5b 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -124,6 +124,11 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasScalarStores(false),
HasInv2PiInlineImm(false),
HasSDWA(false),
+ HasSDWAOmod(false),
+ HasSDWAScalar(false),
+ HasSDWASdst(false),
+ HasSDWAMac(false),
+ HasSDWAClampVOPC(false),
HasDPP(false),
FlatAddressSpace(false),
FlatInstOffsets(false),
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 5f4f20316a6ba..2b16289c723ef 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -149,6 +149,11 @@ protected:
bool HasScalarStores;
bool HasInv2PiInlineImm;
bool HasSDWA;
+ bool HasSDWAOmod;
+ bool HasSDWAScalar;
+ bool HasSDWASdst;
+ bool HasSDWAMac;
+ bool HasSDWAClampVOPC;
bool HasDPP;
bool FlatAddressSpace;
bool FlatInstOffsets;
@@ -431,6 +436,26 @@ public:
return HasSDWA;
}
+ bool hasSDWAOmod() const {
+ return HasSDWAOmod;
+ }
+
+ bool hasSDWAScalar() const {
+ return HasSDWAScalar;
+ }
+
+ bool hasSDWASdst() const {
+ return HasSDWASdst;
+ }
+
+ bool hasSDWAMac() const {
+ return HasSDWAMac;
+ }
+
+ bool hasSDWAClampVOPC() const {
+ return HasSDWAClampVOPC;
+ }
+
/// \brief Returns the offset in bytes from the start of the input buffer
/// of the first explicit kernel argument.
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b644eba536fa4..04fe9f689806c 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -342,6 +342,14 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
PM.add(createAMDGPUExternalAAWrapperPass());
}
});
+
+ Builder.addExtension(
+ PassManagerBuilder::EP_CGSCCOptimizerLate,
+ [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ // Add infer address spaces pass to the opt pipeline after inlining
+ // but before SROA to increase SROA opportunities.
+ PM.add(createInferAddressSpacesPass());
+ });
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 0d6689bd04c4e..88245b01683a5 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -184,9 +184,9 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
}
}
-unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
- if (Vec)
- return 0;
+unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
+ // The concept of vector registers doesn't really exist. Some packed vector
+ // operations operate on the normal 32-bit registers.
// Number of VGPRs on SI.
if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
@@ -195,8 +195,18 @@ unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}
+unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const {
+ // This is really the number of registers to fill when vectorizing /
+ // interleaving loops, so we lie to avoid trying to use all registers.
+ return getHardwareNumberOfRegisters(Vec) >> 3;
+}
+
unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const {
- return Vector ? 0 : 32;
+ return 32;
+}
+
+unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const {
+ return 32;
}
unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
@@ -247,11 +257,11 @@ bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
// Disable unrolling if the loop is not vectorized.
+ // TODO: Enable this again.
if (VF == 1)
return 1;
- // Semi-arbitrary large amount.
- return 64;
+ return 8;
}
int AMDGPUTTIImpl::getArithmeticInstrCost(
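Note: a worked example of the new TTI numbers, assuming the 256 VGPRs that GCN subtargets report (an assumption; the exact value comes from the elided return above). getHardwareNumberOfRegisters returns the real count, while getNumberOfRegisters deliberately under-reports it by a factor of 8 so the vectorizer leaves headroom:

    #include <cassert>

    int main() {
      unsigned HardwareRegs = 256;               // assumed VGPR count on GCN
      unsigned ReportedRegs = HardwareRegs >> 3; // what the vectorizer sees
      assert(ReportedRegs == 32);
      // Register width is now always 32 bits, even for "vector" queries,
      // since packed operations reuse the normal 32-bit registers.
      unsigned RegisterBitWidth = 32;
      (void)RegisterBitWidth;
      return 0;
    }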
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index a60b1bb1b59c7..485e20411ab49 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -75,8 +75,10 @@ public:
return TTI::PSK_FastHardware;
}
- unsigned getNumberOfRegisters(bool Vector);
- unsigned getRegisterBitWidth(bool Vector) const;
+ unsigned getHardwareNumberOfRegisters(bool Vector) const;
+ unsigned getNumberOfRegisters(bool Vector) const;
+  unsigned getRegisterBitWidth(bool Vector) const;
+ unsigned getMinVectorRegisterBitWidth() const;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 392e9d89bd9ba..7b8756050b752 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -152,6 +152,8 @@ public:
ImmTyExpTgt,
ImmTyExpCompr,
ImmTyExpVM,
+ ImmTyDFMT,
+ ImmTyNFMT,
ImmTyHwreg,
ImmTyOff,
ImmTySendMsg,
@@ -260,6 +262,8 @@ public:
return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID);
}
+ bool isSDWARegKind() const;
+
bool isImmTy(ImmTy ImmT) const {
return isImm() && Imm.Type == ImmT;
}
@@ -292,6 +296,8 @@ public:
bool isGLC() const { return isImmTy(ImmTyGLC); }
bool isSLC() const { return isImmTy(ImmTySLC); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
+ bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); }
+ bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); }
bool isBankMask() const { return isImmTy(ImmTyDppBankMask); }
bool isRowMask() const { return isImmTy(ImmTyDppRowMask); }
bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); }
@@ -636,6 +642,8 @@ public:
case ImmTyGLC: OS << "GLC"; break;
case ImmTySLC: OS << "SLC"; break;
case ImmTyTFE: OS << "TFE"; break;
+ case ImmTyDFMT: OS << "DFMT"; break;
+ case ImmTyNFMT: OS << "NFMT"; break;
case ImmTyClampSI: OS << "ClampSI"; break;
case ImmTyOModSI: OS << "OModSI"; break;
case ImmTyDppCtrl: OS << "DppCtrl"; break;
@@ -993,7 +1001,9 @@ private:
void errorExpTgt();
OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val);
- bool validateOperandLimitations(const MCInst &Inst);
+ bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc);
+ bool validateConstantBusLimitations(const MCInst &Inst);
+ bool validateEarlyClobberLimitations(const MCInst &Inst);
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
@@ -1029,6 +1039,8 @@ public:
void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); }
void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); }
void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); }
+ void cvtMtbuf(MCInst &Inst, const OperandVector &Operands);
+
AMDGPUOperand::Ptr defaultGLC() const;
AMDGPUOperand::Ptr defaultSLC() const;
AMDGPUOperand::Ptr defaultTFE() const;
@@ -1042,6 +1054,7 @@ public:
AMDGPUOperand::Ptr defaultSMRDOffset20() const;
AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const;
AMDGPUOperand::Ptr defaultOffsetU12() const;
+ AMDGPUOperand::Ptr defaultOffsetS13() const;
OperandMatchResultTy parseOModOperand(OperandVector &Operands);
@@ -1243,6 +1256,15 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const {
return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg());
}
+bool AMDGPUOperand::isSDWARegKind() const {
+ if (AsmParser->isVI())
+ return isVReg();
+ else if (AsmParser->isGFX9())
+ return isRegKind();
+ else
+ return false;
+}
+
uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const
{
assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers());
@@ -2083,7 +2105,7 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) {
isSGPR(mc2PseudoReg(MO.getReg()), getContext().getRegisterInfo());
}
-bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) {
+bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) {
const unsigned Opcode = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opcode);
unsigned ConstantBusUseCount = 0;
@@ -2137,6 +2159,60 @@ bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) {
return ConstantBusUseCount <= 1;
}
+bool AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst) {
+
+ const unsigned Opcode = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opcode);
+
+ const int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
+ if (DstIdx == -1 ||
+ Desc.getOperandConstraint(DstIdx, MCOI::EARLY_CLOBBER) == -1) {
+ return true;
+ }
+
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+
+ const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+ const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+
+ assert(DstIdx != -1);
+ const MCOperand &Dst = Inst.getOperand(DstIdx);
+ assert(Dst.isReg());
+ const unsigned DstReg = mc2PseudoReg(Dst.getReg());
+
+ const int SrcIndices[] = { Src0Idx, Src1Idx, Src2Idx };
+
+ for (int SrcIdx : SrcIndices) {
+ if (SrcIdx == -1) break;
+ const MCOperand &Src = Inst.getOperand(SrcIdx);
+ if (Src.isReg()) {
+ const unsigned SrcReg = mc2PseudoReg(Src.getReg());
+ if (isRegIntersect(DstReg, SrcReg, TRI)) {
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
+ const SMLoc &IDLoc) {
+ if (!validateConstantBusLimitations(Inst)) {
+ Error(IDLoc,
+ "invalid operand (violates constant bus restrictions)");
+ return false;
+ }
+ if (!validateEarlyClobberLimitations(Inst)) {
+ Error(IDLoc,
+ "destination must be different than all sources");
+ return false;
+ }
+
+ return true;
+}
+
bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
@@ -2169,9 +2245,8 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
switch (Result) {
default: break;
case Match_Success:
- if (!validateOperandLimitations(Inst)) {
- return Error(IDLoc,
- "invalid operand (violates constant bus restrictions)");
+ if (!validateInstruction(Inst, IDLoc)) {
+ return true;
}
Inst.setLoc(IDLoc);
Out.EmitInstruction(Inst, getSTI());
@@ -2554,11 +2629,21 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) {
return MatchOperand_ParseFail;
Parser.Lex();
+
+ bool IsMinus = false;
+ if (getLexer().getKind() == AsmToken::Minus) {
+ Parser.Lex();
+ IsMinus = true;
+ }
+
if (getLexer().isNot(AsmToken::Integer))
return MatchOperand_ParseFail;
if (getParser().parseAbsoluteExpression(Int))
return MatchOperand_ParseFail;
+
+ if (IsMinus)
+ Int = -Int;
break;
}
}
@@ -3743,6 +3828,44 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
}
+void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
+ OptionalImmIndexMap OptionalIdx;
+
+ for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+
+ // Add the register arguments
+ if (Op.isReg()) {
+ Op.addRegOperands(Inst, 1);
+ continue;
+ }
+
+ // Handle the case where soffset is an immediate
+ if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) {
+ Op.addImmOperands(Inst, 1);
+ continue;
+ }
+
+ // Handle tokens like 'offen' which are sometimes hard-coded into the
+ // asm string. There are no MCInst operands for these.
+ if (Op.isToken()) {
+ continue;
+ }
+ assert(Op.isImm());
+
+ // Handle optional arguments
+ OptionalIdx[Op.getImmTy()] = i;
+ }
+
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyOffset);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDFMT);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyNFMT);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+}
+
//===----------------------------------------------------------------------===//
// mimg
//===----------------------------------------------------------------------===//
@@ -3870,6 +3993,10 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetU12() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
}
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetS13() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
+}
+
//===----------------------------------------------------------------------===//
// vop3
//===----------------------------------------------------------------------===//
@@ -3919,6 +4046,8 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr},
{"gds", AMDGPUOperand::ImmTyGDS, true, nullptr},
{"offset", AMDGPUOperand::ImmTyOffset, false, nullptr},
+ {"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr},
+ {"nfmt", AMDGPUOperand::ImmTyNFMT, false, nullptr},
{"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
{"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
{"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr},
@@ -4475,12 +4604,11 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 &&
Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) {
- // V_NOP_sdwa_vi has no optional sdwa arguments
+    // V_NOP_sdwa_vi/gfx9 has no optional sdwa arguments
switch (BasicInstType) {
case SIInstrFlags::VOP1:
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
- if (isGFX9() &&
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0);
}
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
@@ -4490,8 +4618,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
case SIInstrFlags::VOP2:
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
- if (isGFX9() &&
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0);
}
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
@@ -4501,9 +4628,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
break;
case SIInstrFlags::VOPC:
- if (isVI()) {
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
- }
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD);
break;
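Note: the new validateEarlyClobberLimitations above walks src0/src1/src2 and rejects any source register that overlaps an early-clobber destination. A self-contained sketch of that check with hypothetical stand-in types (isRegIntersect is reduced to simple equality here; the real code checks register-unit overlap):

    #include <cstddef>

    struct Operand {
      bool IsReg;
      unsigned Reg;
    };

    // Stand-in for isRegIntersect.
    static bool overlaps(unsigned A, unsigned B) { return A == B; }

    // Returns false when the early-clobber destination aliases any source,
    // which the parser reports as
    // "destination must be different than all sources".
    static bool checkEarlyClobber(const Operand &Dst, const Operand *Srcs,
                                  size_t NumSrcs) {
      for (size_t I = 0; I != NumSrcs; ++I)
        if (Srcs[I].IsReg && overlaps(Dst.Reg, Srcs[I].Reg))
          return false;
      return true;
    }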
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index 2aca65ac84303..2e96c14eaa320 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -57,6 +57,11 @@ class MUBUFAddr64Table <bit is_addr64, string suffix = ""> {
string OpName = NAME # suffix;
}
+class MTBUFAddr64Table <bit is_addr64, string suffix = ""> {
+ bit IsAddr64 = is_addr64;
+ string OpName = NAME # suffix;
+}
+
//===----------------------------------------------------------------------===//
// MTBUF classes
//===----------------------------------------------------------------------===//
@@ -78,14 +83,31 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins,
let EXP_CNT = 1;
let MTBUF = 1;
let Uses = [EXEC];
-
let hasSideEffects = 0;
let SchedRW = [WriteVMEM];
+
+ let AsmMatchConverter = "cvtMtbuf";
+
+ bits<1> offen = 0;
+ bits<1> idxen = 0;
+ bits<1> addr64 = 0;
+ bits<1> has_vdata = 1;
+ bits<1> has_vaddr = 1;
+ bits<1> has_glc = 1;
+ bits<1> glc_value = 0; // the value for glc if no such operand
+ bits<4> dfmt_value = 1; // the value for dfmt if no such operand
+ bits<3> nfmt_value = 0; // the value for nfmt if no such operand
+ bits<1> has_srsrc = 1;
+ bits<1> has_soffset = 1;
+ bits<1> has_offset = 1;
+ bits<1> has_slc = 1;
+ bits<1> has_tfe = 1;
+ bits<1> has_dfmt = 1;
+ bits<1> has_nfmt = 1;
}
class MTBUF_Real <MTBUF_Pseudo ps> :
- InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
- Enc64 {
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> {
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -97,57 +119,168 @@ class MTBUF_Real <MTBUF_Pseudo ps> :
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
- bits<8> vdata;
bits<12> offset;
- bits<1> offen;
- bits<1> idxen;
- bits<1> glc;
- bits<1> addr64;
- bits<4> dfmt;
- bits<3> nfmt;
- bits<8> vaddr;
- bits<7> srsrc;
- bits<1> slc;
- bits<1> tfe;
- bits<8> soffset;
-
- let Inst{11-0} = offset;
- let Inst{12} = offen;
- let Inst{13} = idxen;
- let Inst{14} = glc;
- let Inst{22-19} = dfmt;
- let Inst{25-23} = nfmt;
- let Inst{31-26} = 0x3a; //encoding
- let Inst{39-32} = vaddr;
- let Inst{47-40} = vdata;
- let Inst{52-48} = srsrc{6-2};
- let Inst{54} = slc;
- let Inst{55} = tfe;
- let Inst{63-56} = soffset;
+ bits<1> glc;
+ bits<4> dfmt;
+ bits<3> nfmt;
+ bits<8> vaddr;
+ bits<8> vdata;
+ bits<7> srsrc;
+ bits<1> slc;
+ bits<1> tfe;
+ bits<8> soffset;
+}
+
+class getMTBUFInsDA<list<RegisterClass> vdataList,
+ list<RegisterClass> vaddrList=[]> {
+ RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
+ RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+ dag InsNoData = !if(!empty(vaddrList),
+ (ins SReg_128:$srsrc, SCSrc_b32:$soffset,
+ offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe),
+ (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
+ offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe)
+ );
+ dag InsData = !if(!empty(vaddrList),
+ (ins vdataClass:$vdata, SReg_128:$srsrc,
+ SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
+ slc:$slc, tfe:$tfe),
+ (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
+ SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
+ slc:$slc, tfe:$tfe)
+ );
+ dag ret = !if(!empty(vdataList), InsNoData, InsData);
}
-class MTBUF_Load_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo <
- opName, (outs regClass:$dst),
- (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
- i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc,
- i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset),
- " $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"#
- " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> {
+class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[]> {
+ dag ret =
+ !if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA<vdataList>.ret,
+ !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA<vdataList, [VGPR_32]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.IdxEn), getMTBUFInsDA<vdataList, [VGPR_32]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA<vdataList, [VReg_64]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA<vdataList, [VReg_64]>.ret,
+ (ins))))));
+}
+
+class getMTBUFAsmOps<int addrKind> {
+ string Pfx =
+ !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $dfmt, $nfmt, $soffset",
+ !if(!eq(addrKind, BUFAddrKind.OffEn),
+ "$vaddr, $srsrc, $dfmt, $nfmt, $soffset offen",
+ !if(!eq(addrKind, BUFAddrKind.IdxEn),
+ "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen",
+ !if(!eq(addrKind, BUFAddrKind.BothEn),
+ "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen offen",
+ !if(!eq(addrKind, BUFAddrKind.Addr64),
+ "$vaddr, $srsrc, $dfmt, $nfmt, $soffset addr64",
+ "")))));
+ string ret = Pfx # "$offset";
+}
+
+class MTBUF_SetupAddr<int addrKind> {
+ bits<1> offen = !if(!eq(addrKind, BUFAddrKind.OffEn), 1,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0));
+
+ bits<1> idxen = !if(!eq(addrKind, BUFAddrKind.IdxEn), 1,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0));
+
+ bits<1> addr64 = !if(!eq(addrKind, BUFAddrKind.Addr64), 1, 0);
+
+ bits<1> has_vaddr = !if(!eq(addrKind, BUFAddrKind.Offset), 0, 1);
+}
+
+class MTBUF_Load_Pseudo <string opName,
+ int addrKind,
+ RegisterClass vdataClass,
+ list<dag> pattern=[],
+ // Workaround bug bz30254
+ int addrKindCopy = addrKind>
+ : MTBUF_Pseudo<opName,
+ (outs vdataClass:$vdata),
+ getMTBUFIns<addrKindCopy>.ret,
+ " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+ pattern>,
+ MTBUF_SetupAddr<addrKindCopy> {
+ let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
let mayLoad = 1;
let mayStore = 0;
}
-class MTBUF_Store_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo <
- opName, (outs),
- (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
- i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr,
- SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset),
- " $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"#
- " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> {
+multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
+ ValueType load_vt = i32,
+ SDPatternOperator ld = null_frag> {
+
+ def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ [(set load_vt:$vdata,
+ (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$dfmt,
+ i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
+ MTBUFAddr64Table<0>;
+
+ def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ [(set load_vt:$vdata,
+ (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset,
+ i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
+ MTBUFAddr64Table<1>;
+
+ def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+
+ let DisableWQM = 1 in {
+ def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
+ def _OFFEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ }
+}
+
+class MTBUF_Store_Pseudo <string opName,
+ int addrKind,
+ RegisterClass vdataClass,
+ list<dag> pattern=[],
+ // Workaround bug bz30254
+ int addrKindCopy = addrKind,
+ RegisterClass vdataClassCopy = vdataClass>
+ : MTBUF_Pseudo<opName,
+ (outs),
+ getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
+ " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+ pattern>,
+ MTBUF_SetupAddr<addrKindCopy> {
+ let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
let mayLoad = 0;
let mayStore = 1;
}
+multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
+ ValueType store_vt = i32,
+ SDPatternOperator st = null_frag> {
+
+ def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+ i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
+ i1:$slc, i1:$tfe))]>,
+ MTBUFAddr64Table<0>;
+
+ def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+ i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
+ i1:$slc, i1:$tfe))]>,
+ MTBUFAddr64Table<1>;
+
+ def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+
+ let DisableWQM = 1 in {
+ def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
+ def _OFFEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ }
+}
+
+
//===----------------------------------------------------------------------===//
// MUBUF classes
//===----------------------------------------------------------------------===//
@@ -676,14 +809,14 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
// MTBUF Instructions
//===----------------------------------------------------------------------===//
-//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0, "tbuffer_load_format_x", []>;
-//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <1, "tbuffer_load_format_xy", []>;
-//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <2, "tbuffer_load_format_xyz", []>;
-def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Pseudo <"tbuffer_load_format_xyzw", VReg_128>;
-def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Pseudo <"tbuffer_store_format_x", VGPR_32>;
-def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Pseudo <"tbuffer_store_format_xy", VReg_64>;
-def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Pseudo <"tbuffer_store_format_xyz", VReg_128>;
-def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Pseudo <"tbuffer_store_format_xyzw", VReg_128>;
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>;
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_128>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>;
} // End let SubtargetPredicate = isGCN
@@ -1093,22 +1226,98 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OF
// MTBUF Patterns
//===----------------------------------------------------------------------===//
-// TBUFFER_STORE_FORMAT_*, addr64=0
-class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF_Pseudo opcode> : Pat<
- (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr,
- i32:$soffset, imm:$inst_offset, imm:$dfmt,
- imm:$nfmt, imm:$offen, imm:$idxen,
- imm:$glc, imm:$slc, imm:$tfe),
- (opcode
- $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen),
- (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc,
- (as_i1imm $slc), (as_i1imm $tfe), $soffset)
->;
+//===----------------------------------------------------------------------===//
+// tbuffer_load/store_format patterns
+//===----------------------------------------------------------------------===//
+
+multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
+ string opcode> {
+ def : Pat<
+ (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
+ (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+ (!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
+ (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
+ (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
-def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>;
-def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>;
-def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>;
-def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
+ def : Pat<
+ (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+ (!cast<MTBUF_Pseudo>(opcode # _BOTHEN)
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset),
+ (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+}
+
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, i32, "TBUFFER_LOAD_FORMAT_X">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2i32, "TBUFFER_LOAD_FORMAT_XY">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4i32, "TBUFFER_LOAD_FORMAT_XYZW">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f32, "TBUFFER_LOAD_FORMAT_X">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">;
+
+multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
+ string opcode> {
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i8imm $dfmt),
+ (as_i8imm $nfmt), (as_i1imm $glc),
+ (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+ (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i8imm $dfmt),
+ (as_i8imm $nfmt), (as_i1imm $glc),
+ (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i8imm $dfmt),
+ (as_i8imm $nfmt), (as_i1imm $glc),
+ (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset,
+ imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+ (!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact)
+ $vdata,
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset),
+ (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+}
+
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, i32, "TBUFFER_STORE_FORMAT_X">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2i32, "TBUFFER_STORE_FORMAT_XY">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4i32, "TBUFFER_STORE_FORMAT_XYZ">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4i32, "TBUFFER_STORE_FORMAT_XYZW">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, f32, "TBUFFER_STORE_FORMAT_X">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4f32, "TBUFFER_STORE_FORMAT_XYZ">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">;
} // End let Predicates = [isGCN]
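The patterns above select the new per-component tbuffer intrinsics directly. A minimal sketch of emitting one from IR, assuming the intrinsic id amdgcn_tbuffer_load from this commit series and with M, Builder, and the operand Values already in scope; the operand order (rsrc, vindex, voffset, soffset, offset, dfmt, nfmt, glc, slc) mirrors the patterns:

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Intrinsics.h"
  using namespace llvm;

  // Scalar f32 overload selects TBUFFER_LOAD_FORMAT_X via the pattern above.
  Function *TBufLoad = Intrinsic::getDeclaration(
      M, Intrinsic::amdgcn_tbuffer_load, {Builder.getFloatTy()});
  Value *V = Builder.CreateCall(
      TBufLoad, {Rsrc, VIndex, VOffset, SOffset, Offset, DFmt, NFmt, Glc, Slc});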
@@ -1224,21 +1433,44 @@ def BUFFER_WBINVL1_si : MUBUF_Real_si <0x71, BUFFER_WBINVL1>;
class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> :
MTBUF_Real<ps>,
+ Enc64,
SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> {
let AssemblerPredicate=isSICI;
let DecoderNamespace="SICI";
- bits<1> addr64;
- let Inst{15} = addr64;
+ let Inst{11-0} = !if(ps.has_offset, offset, ?);
+ let Inst{12} = ps.offen;
+ let Inst{13} = ps.idxen;
+ let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{15} = ps.addr64;
let Inst{18-16} = op;
+ let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
+ let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
+ let Inst{31-26} = 0x3a; //encoding
+ let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+ let Inst{54} = !if(ps.has_slc, slc, ?);
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+ let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
-def TBUFFER_LOAD_FORMAT_XYZW_si : MTBUF_Real_si <3, TBUFFER_LOAD_FORMAT_XYZW>;
-def TBUFFER_STORE_FORMAT_X_si : MTBUF_Real_si <4, TBUFFER_STORE_FORMAT_X>;
-def TBUFFER_STORE_FORMAT_XY_si : MTBUF_Real_si <5, TBUFFER_STORE_FORMAT_XY>;
-def TBUFFER_STORE_FORMAT_XYZ_si : MTBUF_Real_si <6, TBUFFER_STORE_FORMAT_XYZ>;
-def TBUFFER_STORE_FORMAT_XYZW_si : MTBUF_Real_si <7, TBUFFER_STORE_FORMAT_XYZW>;
+multiclass MTBUF_Real_AllAddr_si<bits<3> op> {
+ def _OFFSET_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _ADDR64_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_ADDR64")>;
+ def _OFFEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_si <0>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_si <1>;
+//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_si <3>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_si <4>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_si <5>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_si <6>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_si <7>;
//===----------------------------------------------------------------------===//
// CI
@@ -1350,16 +1582,39 @@ def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>;
class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> :
MTBUF_Real<ps>,
+ Enc64,
SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
let AssemblerPredicate=isVI;
let DecoderNamespace="VI";
+ let Inst{11-0} = !if(ps.has_offset, offset, ?);
+ let Inst{12} = ps.offen;
+ let Inst{13} = ps.idxen;
+ let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
let Inst{18-15} = op;
+ let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
+ let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
+ let Inst{31-26} = 0x3a; //encoding
+ let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+ let Inst{54} = !if(ps.has_slc, slc, ?);
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+ let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
-def TBUFFER_LOAD_FORMAT_XYZW_vi : MTBUF_Real_vi <3, TBUFFER_LOAD_FORMAT_XYZW>;
-def TBUFFER_STORE_FORMAT_X_vi : MTBUF_Real_vi <4, TBUFFER_STORE_FORMAT_X>;
-def TBUFFER_STORE_FORMAT_XY_vi : MTBUF_Real_vi <5, TBUFFER_STORE_FORMAT_XY>;
-def TBUFFER_STORE_FORMAT_XYZ_vi : MTBUF_Real_vi <6, TBUFFER_STORE_FORMAT_XYZ>;
-def TBUFFER_STORE_FORMAT_XYZW_vi : MTBUF_Real_vi <7, TBUFFER_STORE_FORMAT_XYZW>;
+multiclass MTBUF_Real_AllAddr_vi<bits<4> op> {
+ def _OFFSET_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _OFFEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <1>;
+//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <2>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <3>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <4>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <5>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>;
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 88c92b9582fd0..04308fb3aaf64 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -49,6 +49,17 @@ addOperand(MCInst &Inst, const MCOperand& Opnd) {
MCDisassembler::SoftFail;
}
+static int insertNamedMCOperand(MCInst &MI, const MCOperand &Op,
+ uint16_t NameIdx) {
+ int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), NameIdx);
+ if (OpIdx != -1) {
+ auto I = MI.begin();
+ std::advance(I, OpIdx);
+ MI.insert(I, Op);
+ }
+ return OpIdx;
+}
+
static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
uint64_t Addr, const void *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
@@ -106,12 +117,12 @@ static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst,
return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm));
}
-#define DECODE_SDWA9(DecName) \
-DECODE_OPERAND(decodeSDWA9##DecName, decodeSDWA9##DecName)
+#define DECODE_SDWA(DecName) \
+DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName)
-DECODE_SDWA9(Src32)
-DECODE_SDWA9(Src16)
-DECODE_SDWA9(VopcDst)
+DECODE_SDWA(Src32)
+DECODE_SDWA(Src16)
+DECODE_SDWA(VopcDst)
#include "AMDGPUGenDisassemblerTables.inc"
@@ -149,6 +160,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
raw_ostream &WS,
raw_ostream &CS) const {
CommentStream = &CS;
+ bool IsSDWA = false;
// ToDo: AMDGPUDisassembler supports only VI ISA.
if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding])
@@ -170,10 +182,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (Res) break;
Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address);
- if (Res) break;
+ if (Res) { IsSDWA = true; break; }
Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address);
- if (Res) break;
+ if (Res) { IsSDWA = true; break; }
}
// Reinitialize Bytes as DPP64 could have eaten too much
@@ -200,17 +212,36 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si ||
MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi)) {
// Insert dummy unused src2_modifiers.
- int Src2ModIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
- AMDGPU::OpName::src2_modifiers);
- auto I = MI.begin();
- std::advance(I, Src2ModIdx);
- MI.insert(I, MCOperand::createImm(0));
+ insertNamedMCOperand(MI, MCOperand::createImm(0),
+ AMDGPU::OpName::src2_modifiers);
}
+ if (Res && IsSDWA)
+ Res = convertSDWAInst(MI);
+
Size = Res ? (MaxInstBytesNum - Bytes.size()) : 0;
return Res;
}
+DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) {
+ if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst) != -1)
+ // VOPC - insert clamp
+ insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp);
+ } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) {
+ int SDst = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst);
+ if (SDst != -1) {
+ // VOPC - insert VCC register as sdst
+ insertNamedMCOperand(MI, MCOperand::createReg(AMDGPU::VCC),
+ AMDGPU::OpName::sdst);
+ } else {
+ // VOP1/2 - insert omod if present in instruction
+ insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod);
+ }
+ }
+ return MCDisassembler::Success;
+}
+
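A hedged usage sketch of the conversion above; the SDWA opcode name is illustrative rather than taken from this patch. On VI the VOPC encodings carry no sdst field, so the converter materializes VCC at the slot given by the named-operand table:

  MCInst MI;
  MI.setOpcode(AMDGPU::V_CMP_EQ_F32_sdwa);   // hypothetical VI VOPC opcode
  // Insertion lands at the index reported by getNamedOperandIdx, keeping the
  // MCInst layout in sync with what the printer expects.
  insertNamedMCOperand(MI, MCOperand::createReg(AMDGPU::VCC),
                       AMDGPU::OpName::sdst);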
const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const {
return getContext().getRegisterInfo()->
getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]);
@@ -524,8 +555,6 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c
return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN);
}
- assert(Width == OPW16 || Width == OPW32 || Width == OPW64);
-
if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX)
return decodeIntImmed(Val);
@@ -592,36 +621,43 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
return errOperand(Val, "unknown operand encoding " + Twine(Val));
}
-MCOperand AMDGPUDisassembler::decodeSDWA9Src(const OpWidthTy Width,
- unsigned Val) const {
+MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width,
+ unsigned Val) const {
using namespace AMDGPU::SDWA;
- if (SDWA9EncValues::SRC_VGPR_MIN <= Val &&
- Val <= SDWA9EncValues::SRC_VGPR_MAX) {
- return createRegOperand(getVgprClassId(Width),
- Val - SDWA9EncValues::SRC_VGPR_MIN);
- }
- if (SDWA9EncValues::SRC_SGPR_MIN <= Val &&
- Val <= SDWA9EncValues::SRC_SGPR_MAX) {
- return createSRegOperand(getSgprClassId(Width),
- Val - SDWA9EncValues::SRC_SGPR_MIN);
- }
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) {
+ if (SDWA9EncValues::SRC_VGPR_MIN <= Val &&
+ Val <= SDWA9EncValues::SRC_VGPR_MAX) {
+ return createRegOperand(getVgprClassId(Width),
+ Val - SDWA9EncValues::SRC_VGPR_MIN);
+ }
+ if (SDWA9EncValues::SRC_SGPR_MIN <= Val &&
+ Val <= SDWA9EncValues::SRC_SGPR_MAX) {
+ return createSRegOperand(getSgprClassId(Width),
+ Val - SDWA9EncValues::SRC_SGPR_MIN);
+ }
- return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN);
+ return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN);
+ } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) {
+ return createRegOperand(getVgprClassId(Width), Val);
+ }
+ llvm_unreachable("unsupported target");
}
-MCOperand AMDGPUDisassembler::decodeSDWA9Src16(unsigned Val) const {
- return decodeSDWA9Src(OPW16, Val);
+MCOperand AMDGPUDisassembler::decodeSDWASrc16(unsigned Val) const {
+ return decodeSDWASrc(OPW16, Val);
}
-MCOperand AMDGPUDisassembler::decodeSDWA9Src32(unsigned Val) const {
- return decodeSDWA9Src(OPW32, Val);
+MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const {
+ return decodeSDWASrc(OPW32, Val);
}
-MCOperand AMDGPUDisassembler::decodeSDWA9VopcDst(unsigned Val) const {
+MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const {
using namespace AMDGPU::SDWA;
+ assert(STI.getFeatureBits()[AMDGPU::FeatureGFX9] &&
+ "SDWAVopcDst should be present only on GFX9");
if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) {
Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
if (Val > AMDGPU::EncValues::SGPR_MAX) {
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 5fa3cf1a223fa..3d71db909e20d 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -65,6 +65,8 @@ public:
uint64_t Inst,
uint64_t Address) const;
+ DecodeStatus convertSDWAInst(MCInst &MI) const;
+
MCOperand decodeOperand_VGPR_32(unsigned Val) const;
MCOperand decodeOperand_VS_32(unsigned Val) const;
MCOperand decodeOperand_VS_64(unsigned Val) const;
@@ -105,10 +107,10 @@ public:
MCOperand decodeSpecialReg32(unsigned Val) const;
MCOperand decodeSpecialReg64(unsigned Val) const;
- MCOperand decodeSDWA9Src(const OpWidthTy Width, unsigned Val) const;
- MCOperand decodeSDWA9Src16(unsigned Val) const;
- MCOperand decodeSDWA9Src32(unsigned Val) const;
- MCOperand decodeSDWA9VopcDst(unsigned Val) const;
+ MCOperand decodeSDWASrc(const OpWidthTy Width, unsigned Val) const;
+ MCOperand decodeSDWASrc16(unsigned Val) const;
+ MCOperand decodeSDWASrc32(unsigned Val) const;
+ MCOperand decodeSDWAVopcDst(unsigned Val) const;
};
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index 98eda288bcacb..edca6fcd812c8 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -31,8 +31,6 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
let VM_CNT = 1;
let LGKM_CNT = 1;
- let Uses = [EXEC, FLAT_SCR]; // M0
-
let UseNamedOperandTable = 1;
let hasSideEffects = 0;
let SchedRW = [WriteVMEM];
@@ -40,10 +38,16 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
string Mnemonic = opName;
string AsmOperands = asmOps;
+ bits<1> is_flat_global = 0;
+ bits<1> is_flat_scratch = 0;
+
bits<1> has_vdst = 1;
bits<1> has_data = 1;
bits<1> has_glc = 1;
bits<1> glcValue = 0;
+
+  // TODO: Also M0 if the instruction could possibly access LDS (before gfx9 only?).
+ let Uses = !if(is_flat_global, [EXEC], [EXEC, FLAT_SCR]);
}
class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
@@ -68,7 +72,10 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
// Only valid on gfx9
bits<1> lds = 0; // XXX - What does this actually do?
- bits<2> seg; // Segment, 00=flat, 01=scratch, 10=global, 11=reserved
+
+ // Segment, 00=flat, 01=scratch, 10=global, 11=reserved
+ bits<2> seg = !if(ps.is_flat_global, 0b10,
+ !if(ps.is_flat_scratch, 0b01, 0));
// Signed offset. Highest bit ignored for flat and treated as 12-bit
  // unsigned for flat accesses.
@@ -81,7 +88,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
// Only valid on GFX9+
let Inst{12-0} = offset;
let Inst{13} = lds;
- let Inst{15-14} = 0;
+ let Inst{15-14} = seg;
let Inst{16} = !if(ps.has_glc, glc, ps.glcValue);
let Inst{17} = slc;
@@ -106,6 +113,16 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
let mayLoad = 1;
}
+class FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass> :
+ FLAT_Load_Pseudo<opName, regClass, 1> {
+ let is_flat_global = 1;
+}
+
+class FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass> :
+ FLAT_Load_Pseudo<opName, regClass, 1> {
+ let is_flat_scratch = 1;
+}
+
class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
bit HasSignedOffset = 0> : FLAT_Pseudo<
opName,
@@ -119,6 +136,16 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
let has_vdst = 0;
}
+class FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> :
+ FLAT_Store_Pseudo<opName, regClass, 1> {
+ let is_flat_global = 1;
+}
+
+class FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> :
+ FLAT_Store_Pseudo<opName, regClass, 1> {
+ let is_flat_scratch = 1;
+}
+
multiclass FLAT_Atomic_Pseudo<
string opName,
RegisterClass vdst_rc,
@@ -306,6 +333,26 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
} // End SubtargetPredicate = isCI
+let SubtargetPredicate = HasFlatGlobalInsts in {
+def GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
+def GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>;
+def GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>;
+def GLOBAL_LOAD_SSHORT : FLAT_Global_Load_Pseudo <"global_load_sshort", VGPR_32>;
+def GLOBAL_LOAD_DWORD : FLAT_Global_Load_Pseudo <"global_load_dword", VGPR_32>;
+def GLOBAL_LOAD_DWORDX2 : FLAT_Global_Load_Pseudo <"global_load_dwordx2", VReg_64>;
+def GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg_96>;
+def GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>;
+
+def GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>;
+def GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>;
+def GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR_32>;
+def GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>;
+def GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>;
+def GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>;
+
+} // End SubtargetPredicate = HasFlatGlobalInsts
+
+
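The seg field selection above is pure TableGen; a minimal C++ mirror of the encoding table from the comment (00=flat, 01=scratch, 10=global, 11=reserved), offered as a sketch only:

  // Returns the two 'seg' bits placed in Inst{15-14}.
  unsigned flatSegBits(bool IsFlatGlobal, bool IsFlatScratch) {
    return IsFlatGlobal ? 2u : (IsFlatScratch ? 1u : 0u);
  }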
//===----------------------------------------------------------------------===//
// Flat Patterns
//===----------------------------------------------------------------------===//
@@ -557,3 +604,18 @@ defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_vi <0x6a, FLAT_ATOMIC_XOR_X2>;
defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_vi <0x6b, FLAT_ATOMIC_INC_X2>;
defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_vi <0x6c, FLAT_ATOMIC_DEC_X2>;
+def GLOBAL_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, GLOBAL_LOAD_UBYTE>;
+def GLOBAL_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, GLOBAL_LOAD_SBYTE>;
+def GLOBAL_LOAD_USHORT_vi : FLAT_Real_vi <0x12, GLOBAL_LOAD_USHORT>;
+def GLOBAL_LOAD_SSHORT_vi : FLAT_Real_vi <0x13, GLOBAL_LOAD_SSHORT>;
+def GLOBAL_LOAD_DWORD_vi : FLAT_Real_vi <0x14, GLOBAL_LOAD_DWORD>;
+def GLOBAL_LOAD_DWORDX2_vi : FLAT_Real_vi <0x15, GLOBAL_LOAD_DWORDX2>;
+def GLOBAL_LOAD_DWORDX4_vi : FLAT_Real_vi <0x17, GLOBAL_LOAD_DWORDX4>;
+def GLOBAL_LOAD_DWORDX3_vi : FLAT_Real_vi <0x16, GLOBAL_LOAD_DWORDX3>;
+
+def GLOBAL_STORE_BYTE_vi : FLAT_Real_vi <0x18, GLOBAL_STORE_BYTE>;
+def GLOBAL_STORE_SHORT_vi : FLAT_Real_vi <0x1a, GLOBAL_STORE_SHORT>;
+def GLOBAL_STORE_DWORD_vi : FLAT_Real_vi <0x1c, GLOBAL_STORE_DWORD>;
+def GLOBAL_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, GLOBAL_STORE_DWORDX2>;
+def GLOBAL_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, GLOBAL_STORE_DWORDX4>;
+def GLOBAL_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, GLOBAL_STORE_DWORDX3>;
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index b84640230eeeb..7c31c8e397ba7 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -72,6 +72,11 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff);
}
+void AMDGPUInstPrinter::printS16ImmDecOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << formatDec(static_cast<int16_t>(MI->getOperand(OpNo).getImm()));
+}
+
void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -118,6 +123,16 @@ void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo,
}
}
+void AMDGPUInstPrinter::printOffsetS13(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ uint16_t Imm = MI->getOperand(OpNo).getImm();
+ if (Imm != 0) {
+ O << ((OpNo == 0)? "offset:" : " offset:");
+ printS16ImmDecOperand(MI, OpNo, O);
+ }
+}
+
void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -216,6 +231,24 @@ void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo,
O << " vm";
}
+void AMDGPUInstPrinter::printDFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getImm()) {
+ O << " dfmt:";
+ printU8ImmDecOperand(MI, OpNo, O);
+ }
+}
+
+void AMDGPUInstPrinter::printNFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getImm()) {
+ O << " nfmt:";
+ printU8ImmDecOperand(MI, OpNo, O);
+ }
+}
+
void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
const MCRegisterInfo &MRI) {
switch (RegNo) {
@@ -379,7 +412,6 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
uint16_t Lo16 = static_cast<uint16_t>(Imm);
- assert(Lo16 == static_cast<uint16_t>(Imm >> 16));
printImmediate16(Lo16, STI, O);
}
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index c8094c4b840a1..7bbf99a85f409 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -42,6 +42,7 @@ private:
void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printS16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU32ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O,
@@ -52,6 +53,9 @@ private:
void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printOffsetS13(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+
void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printOffset1(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -84,6 +88,10 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printExpVM(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printDFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printNFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printRegOperand(unsigned RegNo, raw_ostream &O);
void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 0a9c2b94c1eee..2b408ff10caae 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -30,14 +30,9 @@ public:
unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; };
- void processFixupValue(const MCAssembler &Asm,
- const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override;
-
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override {
@@ -102,36 +97,11 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
}
-void AMDGPUAsmBackend::processFixupValue(const MCAssembler &Asm,
- const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) {
- MCValue Res;
-
- // When we have complex expressions like: BB0_1 + (BB0_2 - 4), which are
- // used for long branches, this function will be called with
- // IsResolved = false and Value set to some pre-computed value. In
- // the example above, the value would be:
- // (BB0_1 + (BB0_2 - 4)) - CurrentOffsetFromStartOfFunction.
- // This is not what we want. We just want the expression computation
- // only. The reason the MC layer subtracts the current offset from the
- // expression is because the fixup is of kind FK_PCRel_4.
- // For these scenarios, evaluateAsValue gives us the computation that we
- // want.
- if (!IsResolved && Fixup.getValue()->evaluateAsValue(Res, Layout) &&
- Res.isAbsolute()) {
- Value = Res.getConstant();
- IsResolved = true;
-
- }
- if (IsResolved)
- Value = adjustFixupValue(Fixup, Value, &Asm.getContext());
-}
-
-void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel, MCContext &Ctx) const {
+void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool IsPCRel) const {
+ Value = adjustFixupValue(Fixup, Value, &Asm.getContext());
if (!Value)
return; // Doesn't change encoding.
@@ -142,7 +112,7 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
uint32_t Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+ assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the bits from
// the fixup value.
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index a856b17a228f0..1b062064ace1c 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -52,15 +52,15 @@ public:
return 0;
}
- virtual unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+ virtual unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return 0;
}
- virtual unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+ virtual unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return 0;
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index e02acf516c0db..376c9bfe5ccf2 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -70,13 +70,13 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
- unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
+ unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
- unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
+ unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
};
} // end anonymous namespace
@@ -252,9 +252,7 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
uint16_t Lo16 = static_cast<uint16_t>(Imm);
- assert(Lo16 == static_cast<uint16_t>(Imm >> 16));
uint32_t Encoding = getLit16Encoding(Lo16, STI);
- assert(Encoding != 255 && "packed constants can only be inline immediates");
return Encoding;
}
default:
@@ -328,11 +326,11 @@ unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
}
unsigned
-SIMCCodeEmitter::getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
using namespace AMDGPU::SDWA;
-
+
uint64_t RegEnc = 0;
const MCOperand &MO = MI.getOperand(OpNo);
@@ -347,9 +345,9 @@ SIMCCodeEmitter::getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo,
}
unsigned
-SIMCCodeEmitter::getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
using namespace AMDGPU::SDWA;
uint64_t RegEnc = 0;
@@ -365,6 +363,25 @@ SIMCCodeEmitter::getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo,
return RegEnc;
}
+static bool needsPCRel(const MCExpr *Expr) {
+ switch (Expr->getKind()) {
+ case MCExpr::SymbolRef:
+ return true;
+ case MCExpr::Binary: {
+ auto *BE = cast<MCBinaryExpr>(Expr);
+ if (BE->getOpcode() == MCBinaryExpr::Sub)
+ return false;
+ return needsPCRel(BE->getLHS()) || needsPCRel(BE->getRHS());
+ }
+ case MCExpr::Unary:
+ return needsPCRel(cast<MCUnaryExpr>(Expr)->getSubExpr());
+ case MCExpr::Target:
+ case MCExpr::Constant:
+ return false;
+ }
+ llvm_unreachable("invalid kind");
+}
+
uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
const MCOperand &MO,
SmallVectorImpl<MCFixup> &Fixups,
@@ -373,12 +390,21 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
return MRI.getEncodingValue(MO.getReg());
if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) {
- const auto *Expr = dyn_cast<MCSymbolRefExpr>(MO.getExpr());
+    // FIXME: Whether this expression is PCRel should not depend on what
+ // the expression looks like. Given that this is just a general expression,
+ // it should probably be FK_Data_4 and whatever is producing
+ //
+    //    s_add_u32 s2, s2, (extern_const_addrspace+16)
+ //
+ // And expecting a PCRel should instead produce
+ //
+ // .Ltmp1:
+ // s_add_u32 s2, s2, (extern_const_addrspace+16)-.Ltmp1
MCFixupKind Kind;
- if (Expr && Expr->getSymbol().isExternal())
- Kind = FK_Data_4;
- else
+ if (needsPCRel(MO.getExpr()))
Kind = FK_PCRel_4;
+ else
+ Kind = FK_Data_4;
Fixups.push_back(MCFixup::create(4, MO.getExpr(), Kind, MI.getLoc()));
}
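A minimal sketch of how needsPCRel classifies the resulting fixup kind, assuming an MCContext Ctx and MCSymbols Sym and Other are in scope:

  const MCExpr *Ref = MCSymbolRefExpr::create(Sym, Ctx);
  needsPCRel(Ref);                                     // true  -> FK_PCRel_4
  needsPCRel(MCBinaryExpr::createSub(                  // label difference is
      Ref, MCSymbolRefExpr::create(Other, Ctx), Ctx)); // absolute -> FK_Data_4
  needsPCRel(MCBinaryExpr::createAdd(                  // symbol plus constant
      Ref, MCConstantExpr::create(16, Ctx), Ctx));     // true  -> FK_PCRel_4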
diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td
index f6f2582aa11b3..d30d1d382588c 100644
--- a/lib/Target/AMDGPU/Processors.td
+++ b/lib/Target/AMDGPU/Processors.td
@@ -80,7 +80,7 @@ def : Proc<"cayman", R600_VLIW4_Itin,
// Southern Islands
//===----------------------------------------------------------------------===//
-def : ProcessorModel<"gfx600", SIFullSpeedModel,
+def : ProcessorModel<"gfx600", SIFullSpeedModel,
[FeatureISAVersion6_0_0]>;
def : ProcessorModel<"SI", SIFullSpeedModel,
@@ -95,7 +95,7 @@ def : ProcessorModel<"gfx601", SIQuarterSpeedModel,
[FeatureISAVersion6_0_1]
>;
-def : ProcessorModel<"pitcairn", SIQuarterSpeedModel,
+def : ProcessorModel<"pitcairn", SIQuarterSpeedModel,
[FeatureISAVersion6_0_1]>;
def : ProcessorModel<"verde", SIQuarterSpeedModel,
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index c55878f8bff0f..215791f4f92dd 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -584,23 +584,23 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
return LowerImplicitParameter(DAG, VT, DL, 8);
case Intrinsic::r600_read_tgid_x:
- return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T1_X, VT);
+ return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T1_X, VT);
case Intrinsic::r600_read_tgid_y:
- return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T1_Y, VT);
+ return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T1_Y, VT);
case Intrinsic::r600_read_tgid_z:
- return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T1_Z, VT);
+ return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T1_Z, VT);
case Intrinsic::r600_read_tidig_x:
- return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T0_X, VT);
+ return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T0_X, VT);
case Intrinsic::r600_read_tidig_y:
- return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T0_Y, VT);
+ return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T0_Y, VT);
case Intrinsic::r600_read_tidig_z:
- return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T0_Z, VT);
+ return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T0_Z, VT);
case Intrinsic::r600_recipsqrt_ieee:
return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index 5cd90323ff67b..3915c0e5bdbed 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -118,9 +118,9 @@ namespace AMDGPU {
// Operand for source modifiers for VOP instructions
OPERAND_INPUT_MODS,
- // Operand for GFX9 SDWA instructions
- OPERAND_SDWA9_SRC,
- OPERAND_SDWA9_VOPC_DST,
+ // Operand for SDWA instructions
+ OPERAND_SDWA_SRC,
+ OPERAND_SDWA_VOPC_DST,
/// Operand with 32-bit immediate that uses the constant bus.
OPERAND_KIMM32,
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 5f5f25103c027..0a795c99f94e5 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -174,6 +174,31 @@ static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
}
+static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
+ const SIRegisterInfo *TRI,
+ const SIInstrInfo *TII) {
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ auto &Src = MI.getOperand(1);
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned SrcReg = Src.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+ !TargetRegisterInfo::isVirtualRegister(DstReg))
+ return false;
+
+ for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) {
+ const auto *UseMI = MO.getParent();
+ if (UseMI == &MI)
+ continue;
+ if (MO.isDef() || UseMI->getParent() != MI.getParent() ||
+ UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END ||
+ !TII->isOperandLegal(*UseMI, UseMI->getOperandNo(&MO), &Src))
+ return false;
+ }
+ // Change VGPR to SGPR destination.
+ MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));
+ return true;
+}
+
// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
//
// SGPRx = ...
@@ -214,6 +239,9 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
return false;
+ if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII))
+ return true;
+
// TODO: Could have multiple extracts?
unsigned SubReg = CopyUse.getOperand(1).getSubReg();
if (SubReg != AMDGPU::NoSubRegister)
@@ -563,6 +591,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
break;
}
TII->moveToVALU(MI);
+ } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
+ tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
}
break;
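A hedged before/after sketch of tryChangeVGPRtoSGPRinCopy (virtual register numbers illustrative): when every user of the copy's destination tolerates an SGPR operand, only the destination register class changes.

  //   %1:vgpr_32 = COPY %0:sgpr_32    ; all users of %1 accept an SGPR
  // becomes, by retyping %1 in place:
  //   %1:sgpr_32 = COPY %0:sgpr_32    ; now a trivially coalescable copy
  MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));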
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index e10f1ed3762e8..f391f67a241f1 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -13,6 +13,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -166,6 +167,8 @@ static bool updateOperand(FoldCandidate &Fold,
if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
TargetRegisterInfo::isVirtualRegister(New->getReg())) {
Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
+
+ Old.setIsUndef(New->isUndef());
return true;
}
@@ -470,7 +473,7 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
return &Op;
MachineInstr *Def = MRI.getVRegDef(Op.getReg());
- if (Def->isMoveImmediate()) {
+ if (Def && Def->isMoveImmediate()) {
MachineOperand &ImmSrc = Def->getOperand(1);
if (ImmSrc.isImm())
return &ImmSrc;
@@ -921,12 +924,9 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
// level.
bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath();
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
-
- MachineBasicBlock &MBB = *BI;
+ for (MachineBasicBlock *MBB : depth_first(&MF)) {
MachineBasicBlock::iterator I, Next;
- for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ for (I = MBB->begin(); I != MBB->end(); I = Next) {
Next = std::next(I);
MachineInstr &MI = *I;
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index b1bd14e421f02..08a64de385018 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -284,7 +284,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
- if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) {
+ if (ST.isAmdCodeObjectV2(MF)) {
PreloadedPrivateBufferReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
}
@@ -363,14 +363,14 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
// Use relocations to get the pointer, and setup the other bits manually.
uint64_t Rsrc23 = TII->getScratchRsrcWords23();
- if (MFI->hasPrivateMemoryInputPtr()) {
+ if (MFI->hasImplicitBufferPtr()) {
unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
BuildMI(MBB, I, DL, Mov64, Rsrc01)
- .addReg(PreloadedPrivateBufferReg)
+ .addReg(MFI->getImplicitBufferPtrUserSGPR())
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
} else {
const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
@@ -385,7 +385,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
MachineMemOperand::MODereferenceable,
0, 0);
BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
- .addReg(PreloadedPrivateBufferReg)
+ .addReg(MFI->getImplicitBufferPtrUserSGPR())
.addImm(0) // offset
.addImm(0) // glc
.addMemOperand(MMO)
@@ -417,14 +417,69 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- if (MFI->isEntryFunction())
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ if (FuncInfo->isEntryFunction()) {
emitEntryFunctionPrologue(MF, MBB);
+ return;
+ }
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
+ unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
+
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ DebugLoc DL;
+
+ bool NeedFP = hasFP(MF);
+ if (NeedFP) {
+ // If we need a base pointer, set it up here. It's whatever the value of
+ // the stack pointer is at this point. Any variable size objects will be
+ // allocated after this, so we can still use the base pointer to reference
+ // locals.
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
+ .addReg(StackPtrReg)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ uint32_t NumBytes = MFI.getStackSize();
+ if (NumBytes != 0 && hasSP(MF)) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
+ .addReg(StackPtrReg)
+ .addImm(NumBytes * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
}
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ if (FuncInfo->isEntryFunction())
+ return;
+ unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
+ if (StackPtrReg == AMDGPU::NoRegister)
+ return;
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ uint32_t NumBytes = MFI.getStackSize();
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ DebugLoc DL;
+
+  // FIXME: Clarify the distinction between an unset SP and a set one. For
+  // callee functions, it's really a question of whether SP needs to be accurate.
+
+ if (NumBytes != 0 && hasSP(MF)) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
+ .addReg(StackPtrReg)
+ .addImm(NumBytes * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameDestroy);
+ }
}
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
@@ -557,3 +612,19 @@ void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
}
}
+
+bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
+ // All stack operations are relative to the frame offset SGPR.
+  // TODO: We still want to eliminate the frame pointer in some cases.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // XXX - Is this only called after frame is finalized? Should be able to check
+ // frame size.
+ return MFI.hasStackObjects() && !allStackObjectsAreDead(MFI);
+}
+
+bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
+ // All stack operations are relative to the frame offset SGPR.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ return MFI.hasCalls() || MFI.hasVarSizedObjects();
+}
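The wavefront-size multiply in the prologue and epilogue above reflects that the scratch stack pointer is a byte offset for the whole wave; a sketch of the arithmetic with illustrative values:

  uint32_t NumBytes = 16;               // per-lane frame size from MFI
  unsigned Lanes = 64;                  // ST.getWavefrontSize() on SI/VI
  uint32_t SPAdjust = NumBytes * Lanes; // 1024 bytes added to StackPtrReg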
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index e17adbe273614..d4dfa1c7eaa86 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -60,6 +60,10 @@ private:
/// \brief Emits debugger prologue.
void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+public:
+ bool hasFP(const MachineFunction &MF) const override;
+ bool hasSP(const MachineFunction &MF) const;
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 441f1ef4bd04c..d0f4e00994de1 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -211,6 +211,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UADDO, MVT::i32, Legal);
setOperationAction(ISD::USUBO, MVT::i32, Legal);
+ setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
+ setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
+
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
@@ -471,6 +474,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, VT, Custom);
}
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::ADDCARRY);
+ setTargetDAGCombine(ISD::SUB);
+ setTargetDAGCombine(ISD::SUBCARRY);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);
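With ADDCARRY and SUBCARRY legal for i32, the new combines can funnel wide adds into carry chains. A sketch of the node pair involved, assuming DAG, DL, and 32-bit halves LoA/LoB/HiA/HiB are in scope:

  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i1);
  SDValue Lo = DAG.getNode(ISD::UADDO, DL, VTs, LoA, LoB);      // low 32 bits
  SDValue Hi = DAG.getNode(ISD::ADDCARRY, DL, VTs,              // high 32 bits
                           HiA, HiB, Lo.getValue(1));           // plus carry-in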
@@ -1061,10 +1068,10 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) {
- if (Info.hasPrivateMemoryInputPtr()) {
- unsigned PrivateMemoryPtrReg = Info.addPrivateMemoryPtr(TRI);
- MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SGPR_64RegClass);
- CCInfo.AllocateReg(PrivateMemoryPtrReg);
+ if (Info.hasImplicitBufferPtr()) {
+ unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
+ MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
+ CCInfo.AllocateReg(ImplicitBufferPtrReg);
}
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
@@ -1227,7 +1234,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
}
}
- if (NeedSP){
+ if (NeedSP) {
unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF);
Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
@@ -2998,7 +3005,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntrinsicID) {
case Intrinsic::amdgcn_implicit_buffer_ptr: {
- unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ if (getSubtarget()->isAmdCodeObjectV2(MF))
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+ unsigned Reg = TRI->getPreloadedValue(MF,
+ SIRegisterInfo::IMPLICIT_BUFFER_PTR);
return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
}
case Intrinsic::amdgcn_dispatch_ptr:
@@ -3288,6 +3299,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
SDLoc DL(Op);
+ MachineFunction &MF = DAG.getMachineFunction();
+
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
case Intrinsic::amdgcn_atomic_dec: {
@@ -3313,7 +3326,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(5), // glc
Op.getOperand(6) // slc
};
- MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
@@ -3328,6 +3340,29 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
}
+ case Intrinsic::amdgcn_tbuffer_load: {
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // rsrc
+ Op.getOperand(3), // vindex
+ Op.getOperand(4), // voffset
+ Op.getOperand(5), // soffset
+ Op.getOperand(6), // offset
+ Op.getOperand(7), // dfmt
+ Op.getOperand(8), // nfmt
+ Op.getOperand(9), // glc
+ Op.getOperand(10) // slc
+ };
+
+ EVT VT = Op.getOperand(2).getValueType();
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad,
+ VT.getStoreSize(), VT.getStoreSize());
+ return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
+ Op->getVTList(), Ops, VT, MMO);
+ }
// Basic sample.
case Intrinsic::amdgcn_image_sample:
case Intrinsic::amdgcn_image_sample_cl:
@@ -3393,10 +3428,10 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SelectionDAG &DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ MachineFunction &MF = DAG.getMachineFunction();
switch (IntrinsicID) {
case Intrinsic::amdgcn_exp: {
@@ -3463,33 +3498,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
Op.getOperand(2), Op.getOperand(3));
}
- case AMDGPUIntrinsic::SI_tbuffer_store: {
- SDValue Ops[] = {
- Chain,
- Op.getOperand(2),
- Op.getOperand(3),
- Op.getOperand(4),
- Op.getOperand(5),
- Op.getOperand(6),
- Op.getOperand(7),
- Op.getOperand(8),
- Op.getOperand(9),
- Op.getOperand(10),
- Op.getOperand(11),
- Op.getOperand(12),
- Op.getOperand(13),
- Op.getOperand(14)
- };
-
- EVT VT = Op.getOperand(3).getValueType();
-
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOStore,
- VT.getStoreSize(), 4);
- return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
- Op->getVTList(), Ops, VT, MMO);
- }
case AMDGPUIntrinsic::AMDGPU_kill: {
SDValue Src = Op.getOperand(2);
if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
@@ -3505,7 +3513,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
case Intrinsic::amdgcn_s_barrier: {
if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
- const MachineFunction &MF = DAG.getMachineFunction();
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second;
if (WGSize <= ST.getWavefrontSize())
@@ -3514,6 +3521,75 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
return SDValue();
};
+ case AMDGPUIntrinsic::SI_tbuffer_store: {
+
+ // Extract vindex and voffset from vaddr as appropriate
+ const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
+ const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
+ SDValue VAddr = Op.getOperand(5);
+
+ SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
+
+ assert(!(OffEn->isOne() && IdxEn->isOne()) &&
+ "Legacy intrinsic doesn't support both offset and index - use new version");
+
+ SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
+ SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
+
+ // Deal with the vec-3 case
+ const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
+ auto Opcode = NumChannels->getZExtValue() == 3 ?
+ AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
+
+ SDValue Ops[] = {
+ Chain,
+ Op.getOperand(3), // vdata
+ Op.getOperand(2), // rsrc
+ VIndex,
+ VOffset,
+ Op.getOperand(6), // soffset
+ Op.getOperand(7), // inst_offset
+ Op.getOperand(8), // dfmt
+ Op.getOperand(9), // nfmt
+ Op.getOperand(12), // glc
+ Op.getOperand(13), // slc
+ };
+
+ assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
+ "Value of tfe other than zero is unsupported");
+
+ EVT VT = Op.getOperand(3).getValueType();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOStore,
+ VT.getStoreSize(), 4);
+ return DAG.getMemIntrinsicNode(Opcode, DL,
+ Op->getVTList(), Ops, VT, MMO);
+ }
+
+ case Intrinsic::amdgcn_tbuffer_store: {
+ SDValue Ops[] = {
+ Chain,
+ Op.getOperand(2), // vdata
+ Op.getOperand(3), // rsrc
+ Op.getOperand(4), // vindex
+ Op.getOperand(5), // voffset
+ Op.getOperand(6), // soffset
+ Op.getOperand(7), // offset
+ Op.getOperand(8), // dfmt
+ Op.getOperand(9), // nfmt
+ Op.getOperand(10), // glc
+ Op.getOperand(11) // slc
+ };
+ EVT VT = Op.getOperand(3).getValueType();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOStore,
+ VT.getStoreSize(), 4);
+ return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
+ Op->getVTList(), Ops, VT, MMO);
+ }
+
default:
return Op;
}
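The legacy `SI_tbuffer_store` lowering above splits the single `vaddr` operand into separate `vindex`/`voffset` operands based on the `idxen`/`offen` flags, asserting that at most one flag is set. A standalone sketch of that mapping (hypothetical helper, not part of the patch):

```cpp
#include <cassert>

// Sketch of the vaddr -> (vindex, voffset) split performed above; the
// legacy intrinsic may set idxen or offen, but not both.
struct TBufferAddr { unsigned VIndex, VOffset; };

TBufferAddr splitVAddr(unsigned VAddr, bool IdxEn, bool OffEn) {
  assert(!(IdxEn && OffEn) && "legacy form supports only one of idxen/offen");
  TBufferAddr A{0, 0};          // unused halves default to constant zero
  if (IdxEn) A.VIndex = VAddr;  // vaddr carries the element index
  if (OffEn) A.VOffset = VAddr; // vaddr carries the byte offset
  return A;
}
```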
@@ -4839,6 +4915,103 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
return 0;
}
+SDValue SITargetLowering::performAddCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+
+ if (VT != MVT::i32)
+ return SDValue();
+
+ SDLoc SL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // add x, zext (setcc) => addcarry x, 0, setcc
+ // add x, sext (setcc) => subcarry x, 0, setcc
+ unsigned Opc = LHS.getOpcode();
+ if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
+ Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
+ std::swap(RHS, LHS);
+
+ Opc = RHS.getOpcode();
+ switch (Opc) {
+ default: break;
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ANY_EXTEND: {
+ auto Cond = RHS.getOperand(0);
+ if (Cond.getOpcode() != ISD::SETCC &&
+ Cond.getOpcode() != AMDGPUISD::FP_CLASS)
+ break;
+ SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
+ SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
+ Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
+ return DAG.getNode(Opc, SL, VTList, Args);
+ }
+ case ISD::ADDCARRY: {
+ // add x, (addcarry y, 0, cc) => addcarry x, y, cc
+ auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+ if (!C || C->getZExtValue() != 0) break;
+ SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
+ return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
+ }
+ }
+ return SDValue();
+}
+
+SDValue SITargetLowering::performSubCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+
+ if (VT != MVT::i32)
+ return SDValue();
+
+ SDLoc SL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ unsigned Opc = LHS.getOpcode();
+ if (Opc != ISD::SUBCARRY)
+ std::swap(RHS, LHS);
+
+ if (LHS.getOpcode() == ISD::SUBCARRY) {
+ // sub (subcarry x, 0, cc), y => subcarry x, y, cc
+ auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+ if (!C || C->getZExtValue() != 0)
+ return SDValue();
+ SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
+ return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
+ }
+ return SDValue();
+}
+
+SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+
+ if (N->getValueType(0) != MVT::i32)
+ return SDValue();
+
+ auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!C || C->getZExtValue() != 0)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue LHS = N->getOperand(0);
+
+ // addcarry (add x, y), 0, cc => addcarry x, y, cc
+ // subcarry (sub x, y), 0, cc => subcarry x, y, cc
+ unsigned LHSOpc = LHS.getOpcode();
+ unsigned Opc = N->getOpcode();
+ if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
+ (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
+ SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
+ return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
+ }
+ return SDValue();
+}
+
SDValue SITargetLowering::performFAddCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
@@ -5009,6 +5182,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
switch (N->getOpcode()) {
default:
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+ case ISD::ADD:
+ return performAddCombine(N, DCI);
+ case ISD::SUB:
+ return performSubCombine(N, DCI);
+ case ISD::ADDCARRY:
+ case ISD::SUBCARRY:
+ return performAddCarrySubCarryCombine(N, DCI);
case ISD::FADD:
return performFAddCombine(N, DCI);
case ISD::FSUB:
@@ -5425,15 +5605,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}
-SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
- const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const {
- SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);
-
- return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
- cast<RegisterSDNode>(VReg)->getReg(), VT);
-}
-
//===----------------------------------------------------------------------===//
// SI Inline Assembly Support
//===----------------------------------------------------------------------===//
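The combines added above rewrite `add`/`sub` trees into `ADDCARRY`/`SUBCARRY` nodes, which consume a carry-in bit and produce a (result, carry-out) pair. A scalar reference model of those semantics, to show why the `zext`/`sext` rewrites are sound (illustrative C++, not the SelectionDAG API):

```cpp
#include <cstdint>
#include <utility>

// Reference semantics for the i32 ADDCARRY/SUBCARRY nodes: each takes a
// carry-in bit and yields (result, carry-out / borrow-out).
std::pair<uint32_t, bool> addcarry(uint32_t X, uint32_t Y, bool CarryIn) {
  uint64_t Wide = uint64_t(X) + Y + CarryIn;
  return {uint32_t(Wide), (Wide >> 32) != 0};
}

std::pair<uint32_t, bool> subcarry(uint32_t X, uint32_t Y, bool BorrowIn) {
  uint64_t Wide = uint64_t(X) - Y - BorrowIn;
  return {uint32_t(Wide), ((Wide >> 32) & 1) != 0};
}
// With c in {0,1}:  x + zext(c) == addcarry(x, 0, c).first, and
// x + sext(c) == x - c == subcarry(x, 0, c).first, matching the
// performAddCombine rewrites above.
```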
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 8e2ec40b224cd..24f88e632d38e 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -108,6 +108,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
unsigned getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0, const SDNode *N1) const;
+ SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -216,8 +219,6 @@ public:
void AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const override;
- SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const override;
SDNode *legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const;
MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL,
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 1097814e99ce2..c9b48fea7225e 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2108,7 +2108,9 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
uint8_t OperandType) const {
- if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET)
+ if (!MO.isImm() ||
+ OperandType < AMDGPU::OPERAND_SRC_FIRST ||
+ OperandType > AMDGPU::OPERAND_SRC_LAST)
return false;
// MachineOperand provides no way to tell the true operand size, since it only
@@ -2433,8 +2435,73 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ // Verify SDWA
+ if (isSDWA(MI)) {
+
+ if (!ST.hasSDWA()) {
+ ErrInfo = "SDWA is not supported on this target";
+ return false;
+ }
+
+ int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
+ if (DstIdx == -1)
+ DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::sdst);
+
+ const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
+
+ for (int OpIdx : OpIndices) {
+ if (OpIdx == -1)
+ continue;
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+
+ if (!ST.hasSDWAScalar()) {
+ // Only VGPRs on VI
+ if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
+ ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
+ return false;
+ }
+ } else {
+ // No immediates on GFX9
+ if (!MO.isReg()) {
+ ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
+ return false;
+ }
+ }
+ }
+
+ if (!ST.hasSDWAOmod()) {
+ // No omod allowed on VI
+ const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
+ if (OMod != nullptr &&
+ (!OMod->isImm() || OMod->getImm() != 0)) {
+ ErrInfo = "OMod not allowed in SDWA instructions on VI";
+ return false;
+ }
+ }
+
+ uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
+ if (isVOPC(BasicOpcode)) {
+ if (!ST.hasSDWASdst() && DstIdx != -1) {
+ // Only VCC allowed as dst on VI for VOPC
+ const MachineOperand &Dst = MI.getOperand(DstIdx);
+ if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
+ ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
+ return false;
+ }
+ } else if (!ST.hasSDWAClampVOPC()) {
+ // No clamp allowed on GFX9 for VOPC
+ const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
+ if (Clamp != nullptr &&
+ (!Clamp->isImm() || Clamp->getImm() != 0)) {
+ ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
+ return false;
+ }
+ }
+ }
+ }
+
// Verify VOP*
- if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) {
+ if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI)) {
// Only look at the true operands. Only a real operand can use the constant
// bus, and we don't want to check pseudo-operands like the source modifier
// flags.
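The new SDWA verification encodes different rules for VI and GFX9. Summarizing the subtarget queries used above (values inferred from the checks and comments in this hunk; treat this as an illustration, not the SISubtarget definition):

```cpp
// Illustrative summary of the per-generation SDWA rules enforced above.
struct SDWARules {
  bool ScalarSrc;  // hasSDWAScalar():    may one source be an SGPR?
  bool OMod;       // hasSDWAOmod():      is a nonzero omod accepted?
  bool SGPRDst;    // hasSDWASdst():      may VOPC write an SGPR pair?
  bool VOPCClamp;  // hasSDWAClampVOPC(): is clamp accepted on VOPC?
};
constexpr SDWARules VI   = {false, false, false, true};
constexpr SDWARules GFX9 = {true,  true,  true,  false};
```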
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index f6e5e8883f63c..74b48c7618087 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -814,6 +814,9 @@ namespace AMDGPU {
int getSDWAOp(uint16_t Opcode);
LLVM_READONLY
+ int getBasicFromSDWAOp(uint16_t Opcode);
+
+ LLVM_READONLY
int getCommuteRev(uint16_t Opcode);
LLVM_READONLY
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 470a47b024433..3b4a8b5d1e817 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -20,6 +20,8 @@ def SIEncodingFamily {
int NONE = -1;
int SI = 0;
int VI = 1;
+ int SDWA = 2;
+ int SDWA9 = 3;
}
//===----------------------------------------------------------------------===//
@@ -39,25 +41,41 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
-def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
- SDTypeProfile<0, 13,
- [SDTCisVT<0, v4i32>, // rsrc(SGPR)
- SDTCisVT<1, iAny>, // vdata(VGPR)
- SDTCisVT<2, i32>, // num_channels(imm)
- SDTCisVT<3, i32>, // vaddr(VGPR)
+def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT",
+ SDTypeProfile<1, 9,
+ [ // vdata
+ SDTCisVT<1, v4i32>, // rsrc
+ SDTCisVT<2, i32>, // vindex(VGPR)
+ SDTCisVT<3, i32>, // voffset(VGPR)
SDTCisVT<4, i32>, // soffset(SGPR)
- SDTCisVT<5, i32>, // inst_offset(imm)
+ SDTCisVT<5, i32>, // offset(imm)
SDTCisVT<6, i32>, // dfmt(imm)
SDTCisVT<7, i32>, // nfmt(imm)
- SDTCisVT<8, i32>, // offen(imm)
- SDTCisVT<9, i32>, // idxen(imm)
- SDTCisVT<10, i32>, // glc(imm)
- SDTCisVT<11, i32>, // slc(imm)
- SDTCisVT<12, i32> // tfe(imm)
+ SDTCisVT<8, i32>, // glc(imm)
+ SDTCisVT<9, i32> // slc(imm)
]>,
- [SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
>;
+def SDTtbuffer_store : SDTypeProfile<0, 10,
+ [ // vdata
+ SDTCisVT<1, v4i32>, // rsrc
+ SDTCisVT<2, i32>, // vindex(VGPR)
+ SDTCisVT<3, i32>, // voffset(VGPR)
+ SDTCisVT<4, i32>, // soffset(SGPR)
+ SDTCisVT<5, i32>, // offset(imm)
+ SDTCisVT<6, i32>, // dfmt(imm)
+ SDTCisVT<7, i32>, // nfmt(imm)
+ SDTCisVT<8, i32>, // glc(imm)
+ SDTCisVT<9, i32> // slc(imm)
+ ]>;
+
+def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3",
+ SDTtbuffer_store,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+
def SDTBufferLoad : SDTypeProfile<1, 5,
[ // vdata
SDTCisVT<1, v4i32>, // rsrc
@@ -452,25 +470,25 @@ def ExpSrc3 : RegisterOperand<VGPR_32> {
let ParserMatchClass = VReg32OrOffClass;
}
-class SDWA9Src : RegisterOperand<VS_32> {
+class SDWASrc : RegisterOperand<VS_32> {
let OperandNamespace = "AMDGPU";
- let OperandType = "OPERAND_SDWA9_SRC";
- let EncoderMethod = "getSDWA9SrcEncoding";
+ let OperandType = "OPERAND_SDWA_SRC";
+ let EncoderMethod = "getSDWASrcEncoding";
}
-def SDWA9Src32 : SDWA9Src {
- let DecoderMethod = "decodeSDWA9Src32";
+def SDWASrc32 : SDWASrc {
+ let DecoderMethod = "decodeSDWASrc32";
}
-def SDWA9Src16 : SDWA9Src {
- let DecoderMethod = "decodeSDWA9Src16";
+def SDWASrc16 : SDWASrc {
+ let DecoderMethod = "decodeSDWASrc16";
}
-def SDWA9VopcDst : VOPDstOperand<SReg_64> {
+def SDWAVopcDst : VOPDstOperand<SReg_64> {
let OperandNamespace = "AMDGPU";
- let OperandType = "OPERAND_SDWA9_VOPC_DST";
- let EncoderMethod = "getSDWA9VopcDstEncoding";
- let DecoderMethod = "decodeSDWA9VopcDst";
+ let OperandType = "OPERAND_SDWA_VOPC_DST";
+ let EncoderMethod = "getSDWAVopcDstEncoding";
+ let DecoderMethod = "decodeSDWAVopcDst";
}
class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass {
@@ -525,7 +543,7 @@ def idxen : NamedOperandBit<"Idxen", NamedMatchClass<"Idxen">>;
def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>;
def offset_u12 : NamedOperandU12<"Offset", NamedMatchClass<"OffsetU12">>;
-def offset_s13 : NamedOperandS13<"Offset", NamedMatchClass<"OffsetS13">>;
+def offset_s13 : NamedOperandS13<"OffsetS13", NamedMatchClass<"OffsetS13">>;
def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>;
def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>;
def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>;
@@ -545,6 +563,9 @@ def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>;
def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>;
+def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>;
+def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>;
+
def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>;
@@ -634,13 +655,13 @@ class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass>
def Int32InputMods : IntInputMods<Int32InputModsMatchClass>;
def Int64InputMods : IntInputMods<Int64InputModsMatchClass>;
-def FPRegInputModsMatchClass : AsmOperandClass {
- let Name = "RegWithFPInputMods";
+def FPRegSDWAInputModsMatchClass : AsmOperandClass {
+ let Name = "SDWARegWithFPInputMods";
let ParserMethod = "parseRegWithFPInputMods";
- let PredicateMethod = "isRegKind";
+ let PredicateMethod = "isSDWARegKind";
}
-def FPRegInputMods : InputMods <FPRegInputModsMatchClass> {
+def FPRegSDWAInputMods : InputMods <FPRegSDWAInputModsMatchClass> {
let PrintMethod = "printOperandAndFPInputMods";
}
@@ -655,13 +676,13 @@ def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> {
}
-def IntRegInputModsMatchClass : AsmOperandClass {
- let Name = "RegWithIntInputMods";
+def IntRegSDWAInputModsMatchClass : AsmOperandClass {
+ let Name = "SDWARegWithIntInputMods";
let ParserMethod = "parseRegWithIntInputMods";
- let PredicateMethod = "isRegKind";
+ let PredicateMethod = "isSDWARegKind";
}
-def IntRegInputMods : InputMods <IntRegInputModsMatchClass> {
+def IntRegSDWAInputMods : InputMods <IntRegSDWAInputModsMatchClass> {
let PrintMethod = "printOperandAndIntInputMods";
}
@@ -851,10 +872,10 @@ class getVALUDstForVT<ValueType VT> {
}
// Returns the register class to use for the destination of VOP[12C]
-// instructions with GFX9 SDWA extension
-class getSDWA9DstForVT<ValueType VT> {
+// instructions with SDWA extension
+class getSDWADstForVT<ValueType VT> {
RegisterOperand ret = !if(!eq(VT.Size, 1),
- SDWA9VopcDst, // VOPC
+ SDWAVopcDst, // VOPC
VOPDstOperand<VGPR_32>); // VOP1/2 32-bit dst
}
@@ -898,8 +919,8 @@ class getVregSrcForVT<ValueType VT> {
!if(!eq(VT.Size, 64), VReg_64, VGPR_32));
}
-class getSDWA9SrcForVT <ValueType VT> {
- RegisterOperand ret = !if(!eq(VT.Size, 16), SDWA9Src16, SDWA9Src32);
+class getSDWASrcForVT <ValueType VT> {
+ RegisterOperand ret = !if(!eq(VT.Size, 16), SDWASrc16, SDWASrc32);
}
// Returns the register class to use for sources of VOP3 instructions for the
@@ -995,7 +1016,7 @@ class getSrcMod <ValueType VT> {
);
}
-// Return type of input modifiers operand specified input operand for SDWA/DPP
+// Return type of input modifiers operand specified input operand for DPP
+// Return type of input modifiers operand specified input operand for DPP
class getSrcModExt <ValueType VT> {
bit isFP = !if(!eq(VT.Value, f16.Value), 1,
!if(!eq(VT.Value, f32.Value), 1,
@@ -1004,13 +1025,13 @@ class getSrcModExt <ValueType VT> {
Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods);
}
-// Return type of input modifiers operand specified input operand for SDWA 9
-class getSrcModSDWA9 <ValueType VT> {
+// Return type of input modifiers operand specified input operand for SDWA
+class getSrcModSDWA <ValueType VT> {
bit isFP = !if(!eq(VT.Value, f16.Value), 1,
!if(!eq(VT.Value, f32.Value), 1,
!if(!eq(VT.Value, f64.Value), 1,
0)));
- Operand ret = !if(isFP, FPRegInputMods, IntRegInputMods);
+ Operand ret = !if(isFP, FPRegSDWAInputMods, IntRegSDWAInputMods);
}
// Returns the input arguments for VOP[12C] instructions for the given SrcVT.
@@ -1141,36 +1162,12 @@ class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
/* endif */)));
}
-class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
- bit HasFloatModifiers, Operand Src0Mod, Operand Src1Mod,
- ValueType DstVT> {
- dag ret = !if(!eq(NumSrcArgs, 0),
- // VOP1 without input operands (V_NOP)
- (ins),
- !if(!eq(NumSrcArgs, 1),
- // VOP1_SDWA
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
- src0_sel:$src0_sel),
- !if(!eq(NumSrcArgs, 2),
- !if(!eq(DstVT.Size, 1),
- // VOPC_SDWA with modifiers
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel),
- // VOP2_SDWA with modifiers
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
- src0_sel:$src0_sel, src1_sel:$src1_sel)),
- (ins)/* endif */)));
-}
-// Ins for GFX9 SDWA
-class getInsSDWA9 <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs,
- bit HasSDWAOMod, Operand Src0Mod, Operand Src1Mod,
- ValueType DstVT> {
+// Ins for SDWA
+class getInsSDWA <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs,
+ bit HasSDWAOMod, Operand Src0Mod, Operand Src1Mod,
+ ValueType DstVT> {
dag ret = !if(!eq(NumSrcArgs, 0),
// VOP1 without input operands (V_NOP)
@@ -1178,31 +1175,31 @@ class getInsSDWA9 <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArg
!if(!eq(NumSrcArgs, 1),
// VOP1
!if(!eq(HasSDWAOMod, 0),
- // VOP1_SDWA9 without omod
+ // VOP1_SDWA without omod
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
clampmod:$clamp,
dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel),
- // VOP1_SDWA9 with omod
+ // VOP1_SDWA with omod
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
clampmod:$clamp, omod:$omod,
dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel)),
!if(!eq(NumSrcArgs, 2),
!if(!eq(DstVT.Size, 1),
- // VOPC_SDWA9
+ // VOPC_SDWA
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
- src0_sel:$src0_sel, src1_sel:$src1_sel),
- // VOP2_SDWA9
+ clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel),
+ // VOP2_SDWA
!if(!eq(HasSDWAOMod, 0),
- // VOP2_SDWA9 without omod
+ // VOP2_SDWA without omod
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
clampmod:$clamp,
dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel),
- // VOP1_SDWA9 with omod
+ // VOP2_SDWA with omod
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
clampmod:$clamp, omod:$omod,
@@ -1220,12 +1217,12 @@ class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCExt> {
(outs)); // V_NOP
}
-// Outs for GFX9 SDWA
-class getOutsSDWA9 <bit HasDst, ValueType DstVT, RegisterOperand DstRCSDWA9> {
+// Outs for SDWA
+class getOutsSDWA <bit HasDst, ValueType DstVT, RegisterOperand DstRCSDWA> {
dag ret = !if(HasDst,
!if(!eq(DstVT.Size, 1),
- (outs DstRCSDWA9:$sdst),
- (outs DstRCSDWA9:$vdst)),
+ (outs DstRCSDWA:$sdst),
+ (outs DstRCSDWA:$vdst)),
(outs)); // V_NOP
}
@@ -1387,8 +1384,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
field ValueType Src2VT = ArgVT[3];
field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret;
field RegisterOperand DstRCDPP = getVALUDstForVT<DstVT>.ret;
- field RegisterOperand DstRCSDWA = getVALUDstForVT<DstVT>.ret;
- field RegisterOperand DstRCSDWA9 = getSDWA9DstForVT<DstVT>.ret;
+ field RegisterOperand DstRCSDWA = getSDWADstForVT<DstVT>.ret;
field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret;
field RegisterClass Src1RC32 = getVregSrcForVT<Src1VT>.ret;
field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret;
@@ -1396,19 +1392,15 @@ class VOPProfile <list<ValueType> _ArgVT> {
field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret;
field RegisterClass Src0DPP = getVregSrcForVT<Src0VT>.ret;
field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
- field RegisterClass Src0SDWA = getVregSrcForVT<Src0VT>.ret;
- field RegisterClass Src1SDWA = getVregSrcForVT<Src1VT>.ret;
- field RegisterOperand Src0SDWA9 = getSDWA9SrcForVT<Src0VT>.ret;
- field RegisterOperand Src1SDWA9 = getSDWA9SrcForVT<Src0VT>.ret;
+ field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret;
+ field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret;
field Operand Src0Mod = getSrcMod<Src0VT>.ret;
field Operand Src1Mod = getSrcMod<Src1VT>.ret;
field Operand Src2Mod = getSrcMod<Src2VT>.ret;
field Operand Src0ModDPP = getSrcModExt<Src0VT>.ret;
field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret;
- field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret;
- field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret;
- field Operand Src0ModSDWA9 = getSrcModSDWA9<Src0VT>.ret;
- field Operand Src1ModSDWA9 = getSrcModSDWA9<Src1VT>.ret;
+ field Operand Src0ModSDWA = getSrcModSDWA<Src0VT>.ret;
+ field Operand Src1ModSDWA = getSrcModSDWA<Src1VT>.ret;
field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1);
@@ -1457,8 +1449,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
field dag Outs32 = Outs;
field dag Outs64 = Outs;
field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
- field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret;
- field dag OutsSDWA9 = getOutsSDWA9<HasDst, DstVT, DstRCSDWA9>.ret;
+ field dag OutsSDWA = getOutsSDWA<HasDst, DstVT, DstRCSDWA>.ret;
field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
@@ -1471,11 +1462,9 @@ class VOPProfile <list<ValueType> _ArgVT> {
field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs,
HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
- HasModifiers, Src0ModSDWA, Src1ModSDWA,
+ HasSDWAOMod, Src0ModSDWA, Src1ModSDWA,
DstVT>.ret;
- field dag InsSDWA9 = getInsSDWA9<Src0SDWA9, Src1SDWA9, NumSrcArgs,
- HasSDWAOMod, Src0ModSDWA9, Src1ModSDWA9,
- DstVT>.ret;
+
field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret;
field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, HasOMod, DstVT>.ret;
@@ -1628,13 +1617,13 @@ def getSDWAOp : InstrMapping {
let ValueCols = [["SDWA"]];
}
-// Maps ordinary instructions to their SDWA GFX9 counterparts
-def getSDWA9Op : InstrMapping {
+// Maps SDWA instructions to their ordinary counterparts
+def getBasicFromSDWAOp : InstrMapping {
let FilterClass = "VOP";
let RowFields = ["OpName"];
let ColFields = ["AsmVariantName"];
- let KeyCol = ["Default"];
- let ValueCols = [["SDWA9"]];
+ let KeyCol = ["SDWA"];
+ let ValueCols = [["Default"]];
}
def getMaskedMIMGOp : InstrMapping {
@@ -1669,7 +1658,9 @@ def getMCOpcodeGen : InstrMapping {
let ColFields = ["Subtarget"];
let KeyCol = [!cast<string>(SIEncodingFamily.NONE)];
let ValueCols = [[!cast<string>(SIEncodingFamily.SI)],
- [!cast<string>(SIEncodingFamily.VI)]];
+ [!cast<string>(SIEncodingFamily.VI)],
+ [!cast<string>(SIEncodingFamily.SDWA)],
+ [!cast<string>(SIEncodingFamily.SDWA9)]];
}
// Get equivalent SOPK instruction.
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 18b197ddb7ae7..3203c38dae344 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -74,7 +74,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkItemIDX(false),
WorkItemIDY(false),
WorkItemIDZ(false),
- PrivateMemoryInputPtr(false) {
+ ImplicitBufferPtr(false) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const Function *F = MF.getFunction();
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
@@ -86,6 +86,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
ScratchWaveOffsetReg = AMDGPU::SGPR4;
FrameOffsetReg = AMDGPU::SGPR5;
+ StackPtrOffsetReg = AMDGPU::SGPR32;
return;
}
@@ -150,7 +151,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
DispatchID = true;
} else if (ST.isMesaGfxShader(MF)) {
if (HasStackObjects || MaySpill)
- PrivateMemoryInputPtr = true;
+ ImplicitBufferPtr = true;
}
// We don't need to worry about accessing spills with flat instructions.
@@ -203,11 +204,11 @@ unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
return FlatScratchInitUserSGPR;
}
-unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) {
- PrivateMemoryPtrUserSGPR = TRI.getMatchingSuperReg(
+unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
+ ImplicitBufferPtrUserSGPR = TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
NumUserSGPRs += 2;
- return PrivateMemoryPtrUserSGPR;
+ return ImplicitBufferPtrUserSGPR;
}
/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 9fdb8caac6f21..05aa249584bf1 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -97,7 +97,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
unsigned StackPtrOffsetReg;
// Input registers for non-HSA ABI
- unsigned PrivateMemoryPtrUserSGPR;
+ unsigned ImplicitBufferPtrUserSGPR;
// Input registers setup for the HSA ABI.
// User SGPRs in allocation order.
@@ -179,7 +179,7 @@ private:
// Private memory buffer
// Compute directly in sgpr[0:1]
// Other shaders indirect 64-bits at sgpr[0:1]
- bool PrivateMemoryInputPtr : 1;
+ bool ImplicitBufferPtr : 1;
MCPhysReg getNextUserSGPR() const {
assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
@@ -236,7 +236,7 @@ public:
unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
unsigned addDispatchID(const SIRegisterInfo &TRI);
unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
- unsigned addPrivateMemoryPtr(const SIRegisterInfo &TRI);
+ unsigned addImplicitBufferPtr(const SIRegisterInfo &TRI);
// Add system SGPRs.
unsigned addWorkGroupIDX() {
@@ -341,8 +341,8 @@ public:
return WorkItemIDZ;
}
- bool hasPrivateMemoryInputPtr() const {
- return PrivateMemoryInputPtr;
+ bool hasImplicitBufferPtr() const {
+ return ImplicitBufferPtr;
}
unsigned getNumUserSGPRs() const {
@@ -396,8 +396,8 @@ public:
return QueuePtrUserSGPR;
}
- unsigned getPrivateMemoryPtrUserSGPR() const {
- return PrivateMemoryPtrUserSGPR;
+ unsigned getImplicitBufferPtrUserSGPR() const {
+ return ImplicitBufferPtrUserSGPR;
}
bool hasSpilledSGPRs() const {
diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index f4ddf1891683b..4ac23ef03cb32 100644
--- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -67,9 +67,9 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
void matchSDWAOperands(MachineFunction &MF);
- bool isConvertibleToSDWA(const MachineInstr &MI) const;
+ bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
- void legalizeScalarOperands(MachineInstr &MI) const;
+ void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const;
StringRef getPassName() const override { return "SI Peephole SDWA"; }
@@ -224,7 +224,7 @@ static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
static bool isSubregOf(const MachineOperand &SubReg,
const MachineOperand &SuperReg,
const TargetRegisterInfo *TRI) {
-
+
if (!SuperReg.isReg() || !SubReg.isReg())
return false;
@@ -557,7 +557,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
-
+
if (TRI->isPhysicalRegister(Src0->getReg()) ||
TRI->isPhysicalRegister(Dst->getReg()))
break;
@@ -590,7 +590,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
break;
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
-
+
if (TRI->isPhysicalRegister(Src1->getReg()) ||
TRI->isPhysicalRegister(Dst->getReg()))
break;
@@ -607,16 +607,38 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
}
}
-bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const {
+bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
+ const SISubtarget &ST) const {
// Check if this instruction has opcode that supports SDWA
- unsigned Opc = MI.getOpcode();
- if (AMDGPU::getSDWAOp(Opc) != -1)
- return true;
- int Opc32 = AMDGPU::getVOPe32(Opc);
- if (Opc32 != -1 && AMDGPU::getSDWAOp(Opc32) != -1)
- return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) &&
- !TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
- return false;
+ int Opc = MI.getOpcode();
+ if (AMDGPU::getSDWAOp(Opc) == -1)
+ Opc = AMDGPU::getVOPe32(Opc);
+
+ if (Opc == -1 || AMDGPU::getSDWAOp(Opc) == -1)
+ return false;
+
+ if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+ return false;
+
+ if (TII->isVOPC(Opc)) {
+ if (!ST.hasSDWASdst()) {
+ const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+ if (SDst && SDst->getReg() != AMDGPU::VCC)
+ return false;
+ }
+
+ if (!ST.hasSDWAClampVOPC() && TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
+ return false;
+
+ } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
+ return false;
+ }
+
+ if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
+ Opc == AMDGPU::V_MAC_F32_e32))
+ return false;
+
+ return true;
}
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
@@ -641,6 +663,11 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
if (Dst) {
assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
SDWAInst.add(*Dst);
+ } else {
+ Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+ assert(Dst &&
+ AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
+ SDWAInst.add(*Dst);
}
// Copy src0, initialize src0_modifiers. All SDWA instructions have src0 and
@@ -677,9 +704,23 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
SDWAInst.add(*Src2);
}
- // Initialize clamp.
+ // Copy clamp if present, initialize otherwise
assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
- SDWAInst.addImm(0);
+ MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
+ if (Clamp) {
+ SDWAInst.add(*Clamp);
+ } else {
+ SDWAInst.addImm(0);
+ }
+
+ // Copy omod if present, initialize otherwise if needed
+ MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
+ if (OMod) {
+ assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1);
+ SDWAInst.add(*OMod);
+ } else if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
+ SDWAInst.addImm(0);
+ }
// Initialize dst_sel and dst_unused if present
if (Dst) {
@@ -733,16 +774,25 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
}
// If an instruction was converted to SDWA it should not have immediates or SGPR
-// operands. Copy its scalar operands into VGPRs.
-void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI) const {
+// operands (one SGPR is allowed on GFX9). Copy its scalar operands into VGPRs.
+void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const {
const MCInstrDesc &Desc = TII->get(MI.getOpcode());
- for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
- MachineOperand &Op = MI.getOperand(I);
+ unsigned ConstantBusCount = 0;
+ for (MachineOperand &Op: MI.explicit_uses()) {
if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
continue;
+
+ unsigned I = MI.getOperandNo(&Op);
if (Desc.OpInfo[I].RegClass == -1 ||
!TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
continue;
+
+ if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
+ TRI->isSGPRReg(*MRI, Op.getReg())) {
+ ++ConstantBusCount;
+ continue;
+ }
+
unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
@@ -758,22 +808,20 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI) const {
bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- if (!ST.hasSDWA() ||
- !AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9
+ if (!ST.hasSDWA())
return false;
- }
MRI = &MF.getRegInfo();
TRI = ST.getRegisterInfo();
TII = ST.getInstrInfo();
-
+
// Find all SDWA operands in MF.
matchSDWAOperands(MF);
for (const auto &OperandPair : SDWAOperands) {
const auto &Operand = OperandPair.second;
MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
- if (PotentialMI && isConvertibleToSDWA(*PotentialMI)) {
+ if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
PotentialMatches[PotentialMI].push_back(Operand.get());
}
}
@@ -788,7 +836,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
bool Ret = !ConvertedInstructions.empty();
while (!ConvertedInstructions.empty())
- legalizeScalarOperands(*ConvertedInstructions.pop_back_val());
+ legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
return Ret;
}
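`legalizeScalarOperands` now keeps at most one SGPR source when the subtarget reports `hasSDWAScalar()` (GFX9); every other immediate or scalar operand is still copied into a fresh VGPR via `V_MOV_B32`. A standalone sketch of that single-slot budget (hypothetical names):

```cpp
// Sketch of the one-SGPR constant-bus budget applied above: the first SGPR
// may stay in place on targets with scalar SDWA sources; every other
// scalar operand must be rewritten through a VGPR copy by the caller.
bool mayKeepScalarOperand(bool HasSDWAScalar, bool IsSGPR,
                          unsigned &ConstantBusCount) {
  if (HasSDWAScalar && IsSGPR && ConstantBusCount == 0) {
    ++ConstantBusCount; // occupy the single constant-bus slot
    return true;
  }
  return false; // caller emits V_MOV_B32 into a new VGPR instead
}
```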
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index b611f28fcabdf..ef6ad4ad0c8f3 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1044,18 +1044,29 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
unsigned CarryOut
= MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
unsigned ScaledReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- // XXX - Should this use a vector shift?
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
- .addReg(DiffReg, RegState::Kill)
- .addImm(Log2_32(ST.getWavefrontSize()));
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg)
+ .addImm(Log2_32(ST.getWavefrontSize()))
+ .addReg(DiffReg, RegState::Kill);
// TODO: Fold if use instruction is another add of a constant.
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg)
- .addReg(CarryOut, RegState::Define | RegState::Dead)
- .addImm(Offset)
- .addReg(ScaledReg, RegState::Kill);
+ if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg)
+ .addReg(CarryOut, RegState::Define | RegState::Dead)
+ .addImm(Offset)
+ .addReg(ScaledReg, RegState::Kill);
+ } else {
+ unsigned ConstOffsetReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
+ .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg)
+ .addReg(CarryOut, RegState::Define | RegState::Dead)
+ .addReg(ConstOffsetReg, RegState::Kill)
+ .addReg(ScaledReg, RegState::Kill);
+ }
MRI.setRegAllocationHint(CarryOut, 0, AMDGPU::VCC);
}
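The rewritten frame-index expansion only folds `Offset` directly into `V_ADD_I32_e64` when it is a VALU inline constant; anything else is first materialized with `S_MOV_B32`. A rough sketch of the integer part of that test (the real `isInlinableLiteral32` also accepts a handful of float bit patterns, omitted here):

```cpp
#include <cstdint>

// Rough sketch: VALU inline constants cover the integers -16..64; other
// values (and the float encodings this sketch omits) need a register.
bool isInlinableInt32(int64_t Imm) {
  return Imm >= -16 && Imm <= 64;
}
```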
@@ -1341,12 +1352,11 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
- if (ST.isAmdCodeObjectV2(MF)) {
- assert(MFI->hasPrivateSegmentBuffer());
- return MFI->PrivateSegmentBufferUserSGPR;
- }
- assert(MFI->hasPrivateMemoryInputPtr());
- return MFI->PrivateMemoryPtrUserSGPR;
+ assert(MFI->hasPrivateSegmentBuffer());
+ return MFI->PrivateSegmentBufferUserSGPR;
+ case SIRegisterInfo::IMPLICIT_BUFFER_PTR:
+ assert(MFI->hasImplicitBufferPtr());
+ return MFI->ImplicitBufferPtrUserSGPR;
case SIRegisterInfo::KERNARG_SEGMENT_PTR:
assert(MFI->hasKernargSegmentPtr());
return MFI->KernargSegmentPtrUserSGPR;
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index 8fed6d5f9710f..600cc886cb595 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -197,12 +197,13 @@ public:
WORKGROUP_ID_Y = 11,
WORKGROUP_ID_Z = 12,
PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
+ IMPLICIT_BUFFER_PTR = 15,
// VGPRS:
- FIRST_VGPR_VALUE = 15,
+ FIRST_VGPR_VALUE = 16,
WORKITEM_ID_X = FIRST_VGPR_VALUE,
- WORKITEM_ID_Y = 16,
- WORKITEM_ID_Z = 17
+ WORKITEM_ID_Y = 17,
+ WORKITEM_ID_Z = 18
};
/// \brief Returns the physical register that \p Value is stored in.
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index c5f121757e623..96a18544f02ac 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -92,6 +92,8 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
case AMDGPU::V_ADDC_U32_e64:
case AMDGPU::V_SUBB_U32_e64:
+ if (TII->getNamedOperand(MI, AMDGPU::OpName::src1)->isImm())
+ return false;
// Additional verification is needed for sdst/src2.
return true;
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index f581e69980c79..26515b27bb77d 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -538,6 +538,27 @@ bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
Reg == AMDGPU::SCC;
}
+bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
+
+ if (Reg0 == Reg1) {
+ return true;
+ }
+
+ unsigned SubReg0 = TRI->getSubReg(Reg0, 1);
+ if (SubReg0 == 0) {
+ return TRI->getSubRegIndex(Reg1, Reg0) > 0;
+ }
+
+ for (unsigned Idx = 2; SubReg0 > 0; ++Idx) {
+ if (isRegIntersect(Reg1, SubReg0, TRI)) {
+ return true;
+ }
+ SubReg0 = TRI->getSubReg(Reg0, Idx);
+ }
+
+ return false;
+}
+
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
switch(Reg) {
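`isRegIntersect` walks the sub-register tree of `Reg0`, so overlap is detected whichever operand is the super-register. Expected behavior on SGPR pairs (illustrative usage, assuming `TRI` is the usual `MCRegisterInfo` pointer in scope):

```cpp
// Illustrative expectations for the new helper: a 64-bit pair overlaps
// each of its 32-bit halves, in either argument order, but two distinct
// scalar registers do not overlap.
assert(AMDGPU::isRegIntersect(AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR1, TRI));
assert(AMDGPU::isRegIntersect(AMDGPU::SGPR1, AMDGPU::SGPR0_SGPR1, TRI));
assert(!AMDGPU::isRegIntersect(AMDGPU::SGPR0, AMDGPU::SGPR1, TRI));
```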
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index eff0230d21f57..936e4921a7097 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -271,6 +271,9 @@ bool isGFX9(const MCSubtargetInfo &STI);
/// \brief Is Reg - scalar register
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
+/// \brief Is there any intersection between registers
+bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI);
+
/// If \p Reg is a pseudo reg, return the correct hardware register given
/// \p STI otherwise return \p Reg.
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI);
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index 95b5ef0a49dba..96b33c373f052 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -93,11 +93,6 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let AsmMatchConverter = "cvtSdwaVOP1";
}
-class VOP1_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
- VOP_SDWA9_Pseudo <OpName, P, pattern> {
- let AsmMatchConverter = "cvtSdwaVOP1";
-}
-
class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
list<dag> ret =
!if(P.HasModifiers,
@@ -117,7 +112,6 @@ multiclass VOP1Inst <string opName, VOPProfile P,
def _e32 : VOP1_Pseudo <opName, P>;
def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
- def _sdwa9 : VOP1_SDWA9_Pseudo <opName, P>;
}
// Special profile for instructions which have clamp
@@ -274,12 +268,10 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0);
let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+
let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel);
- let InsSDWA9 = (ins Src0RC32:$vdst, Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
- clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused,
- src0_sel:$src0_sel);
let Asm32 = getAsm32<1, 1>.ret;
let Asm64 = getAsm64<1, 1, 0, 1>.ret;
@@ -545,8 +537,8 @@ multiclass VOP1_Real_vi <bits<10> op> {
VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
def _sdwa_gfx9 :
- VOP_SDWA9_Real <!cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9")>,
- VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>;
+ VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
// For now left dpp only for asm/dasm
// TODO: add corresponding pseudo
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index 657cacaa792ca..7b9bc71ad4c77 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -114,11 +114,6 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let AsmMatchConverter = "cvtSdwaVOP2";
}
-class VOP2_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
- VOP_SDWA9_Pseudo <OpName, P, pattern> {
- let AsmMatchConverter = "cvtSdwaVOP2";
-}
-
class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
list<dag> ret = !if(P.HasModifiers,
[(set P.DstVT:$vdst,
@@ -139,7 +134,6 @@ multiclass VOP2Inst <string opName,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
- def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P>;
}
multiclass VOP2bInst <string opName,
@@ -156,10 +150,6 @@ multiclass VOP2bInst <string opName,
def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
let AsmMatchConverter = "cvtSdwaVOP2b";
}
-
- def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P> {
- let AsmMatchConverter = "cvtSdwaVOP2b";
- }
}
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@@ -221,17 +211,13 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
VGPR_32:$src2, // stub argument
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
VGPR_32:$src2, // stub argument
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ clampmod:$clamp, omod:$omod,
+ dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel);
- let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
- Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1,
- VGPR_32:$src2, // stub argument
- clampmod:$clamp, omod:$omod,
- dst_sel:$dst_sel, dst_unused:$dst_unused,
- src0_sel:$src0_sel, src1_sel:$src1_sel);
let Asm32 = getAsm32<1, 2, vt>.ret;
let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret;
let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret;
@@ -289,15 +275,10 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ clampmod:$clamp, omod:$omod,
+ dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel);
- let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
- Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1,
- clampmod:$clamp, omod:$omod,
- dst_sel:$dst_sel, dst_unused:$dst_unused,
- src0_sel:$src0_sel, src1_sel:$src1_sel);
-
let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0,
Src1Mod:$src1_modifiers, Src1DPP:$src1,
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
@@ -326,6 +307,8 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
let Ins64 = Ins32;
let Asm32 = " $vdst, $src0, $src1";
let Asm64 = Asm32;
+ let HasExt = 0;
+ let HasSDWA9 = 0;
}
def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
@@ -335,6 +318,8 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
let Ins64 = Ins32;
let Asm32 = " $vdst, $src0, $src1";
let Asm64 = Asm32;
+ let HasExt = 0;
+ let HasSDWA9 = 0;
}
//===----------------------------------------------------------------------===//
@@ -397,20 +382,29 @@ def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">;
} // End isConvergent = 1
-defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>;
-defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32>;
-defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_lo>;
-defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_hi>;
-defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>;
-defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_I32_F32_I32>; // TODO: set "Uses = dst"
-defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_I32_F32_F32>;
-defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_I32_F32_F32>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, AMDGPUpkrtz_f16_f32>;
-defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_I32_I32_I32>;
-defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_I32_I32_I32>;
+defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
+defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
+defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_lo>;
+defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
+defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
+defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
+defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
+defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>;
+defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>>;
+defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>>;
} // End SubtargetPredicate = isGCN
+def : Pat<
+ (AMDGPUadde i32:$src0, i32:$src1, i1:$src2),
+ (V_ADDC_U32_e64 $src0, $src1, $src2)
+>;
+
+def : Pat<
+ (AMDGPUsube i32:$src0, i32:$src1, i1:$src2),
+ (V_SUBB_U32_e64 $src0, $src1, $src2)
+>;
// These instructions only exist on SI and CI
let SubtargetPredicate = isSICI in {
@@ -728,8 +722,8 @@ multiclass VOP2_SDWA_Real <bits<6> op> {
multiclass VOP2_SDWA9_Real <bits<6> op> {
def _sdwa_gfx9 :
- VOP_SDWA9_Real <!cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9")>,
- VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>;
+ VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
}
multiclass VOP2be_Real_e32e64_vi <bits<6> op> :
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index cd347b86d3050..f3482a22d5dcd 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -113,11 +113,6 @@ class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let AsmMatchConverter = "cvtSdwaVOPC";
}
-class VOPC_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
- VOP_SDWA9_Pseudo <OpName, P, pattern> {
- let AsmMatchConverter = "cvtSdwaVOPC";
-}
-
// This class is used only with VOPC instructions. Use $sdst for out operand
class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> :
InstAlias <ps.OpName#" "#p.Asm32, (inst)>, PredicateControl {
@@ -189,13 +184,6 @@ multiclass VOPC_Pseudos <string opName,
let isConvergent = DefExec;
let isCompare = 1;
}
-
- def _sdwa9 : VOPC_SDWA9_Pseudo <opName, P> {
- let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
- let SchedRW = P.Schedule;
- let isConvergent = DefExec;
- let isCompare = 1;
- }
}
def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>;
@@ -540,14 +528,12 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
VOPC_Profile<sched, vt, i32> {
let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
let Asm64 = "$sdst, $src0_modifiers, $src1";
+
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel);
- let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
- Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1,
- src0_sel:$src0_sel, src1_sel:$src1_sel);
+
let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel";
- //let AsmSDWA9 = " $sdst, $src0_modifiers, $src1_modifiers $src0_sel $src1_sel";
let HasSrc1Mods = 0;
let HasClamp = 0;
let HasOMod = 0;
@@ -580,12 +566,6 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec> {
let SchedRW = p.Schedule;
let isConvergent = DefExec;
}
-
- def _sdwa9 : VOPC_SDWA9_Pseudo <opName, p> {
- let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
- let SchedRW = p.Schedule;
- let isConvergent = DefExec;
- }
}
def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>;
@@ -954,8 +934,8 @@ multiclass VOPC_Real_vi <bits<10> op> {
VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
def _sdwa_gfx9 :
- VOP_SDWA9_Real <!cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9")>,
- VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>;
+ VOP_SDWA9_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"),
!cast<Instruction>(NAME#"_e32_vi")> {
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index 4da654f84f9d1..e386f21c2ba49 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -232,11 +232,11 @@ class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 {
let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1
let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2
- let Inst{11} = !if(P.HasOpSel, src0_modifiers{2}, 0); // op_sel(0)
- let Inst{12} = !if(P.HasOpSel, src1_modifiers{2}, 0); // op_sel(1)
- let Inst{13} = !if(P.HasOpSel, src2_modifiers{2}, 0); // op_sel(2)
+ let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0)
+ let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1)
+ let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2)
- let Inst{14} = !if(P.HasOpSel, src2_modifiers{3}, 0); // op_sel_hi(2)
+ let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, 0); // op_sel_hi(2)
let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
@@ -245,8 +245,8 @@ class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 {
let Inst{40-32} = !if(P.HasSrc0, src0, 0);
let Inst{49-41} = !if(P.HasSrc1, src1, 0);
let Inst{58-50} = !if(P.HasSrc2, src2, 0);
- let Inst{59} = !if(P.HasOpSel, src0_modifiers{3}, 0); // op_sel_hi(0)
- let Inst{60} = !if(P.HasOpSel, src1_modifiers{3}, 0); // op_sel_hi(1)
+ let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, 0); // op_sel_hi(0)
+ let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, 0); // op_sel_hi(1)
let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo)
let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo)
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
@@ -300,6 +300,19 @@ class VOP_SDWAe<VOPProfile P> : Enc64 {
let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
}
+// GFX9 adds two features to SDWA:
+// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD.
+// a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather
+// than VGPRs (at most 1 can be an SGPR);
+// b. OMOD is the standard output modifier (result *2, *4, /2)
+// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This
+// replaces OMOD and the dest fields with SD and SDST (SGPR destination)
+// field.
+// a. When SD=1, the SDST is used as the destination for the compare result;
+// b. When SD=0, VCC is used.
+//
+// In GFX9, the V_MAC_F16 and V_MAC_F32 opcodes cannot be used with SDWA.
+
// gfx9 SDWA basic encoding
class VOP_SDWA9e<VOPProfile P> : Enc64 {
bits<9> src0; // {src0_sgpr{0}, src0{7-0}}
@@ -353,6 +366,7 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
string Mnemonic = opName;
string AsmOperands = P.AsmSDWA;
+ string AsmOperands9 = P.AsmSDWA9;
let Size = 8;
let mayLoad = 0;
@@ -372,53 +386,9 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
VOPProfile Pfl = P;
}
-// GFX9 adds two features to SDWA:
-// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD.
-// a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather
-// than VGPRs (at most 1 can be an SGPR);
-// b. OMOD is the standard output modifier (result *2, *4, /2)
-// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This
-// replaces OMOD and the dest fields with SD and SDST (SGPR destination)
-// field.
-// a. When SD=1, the SDST is used as the destination for the compare result;
-// b.when SD=0, VCC is used.
-//
-// In GFX9, V_MAC_F16, V_MAC_F32 opcodes cannot be used with SDWA
-
-class VOP_SDWA9_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
- InstSI <P.OutsSDWA9, P.InsSDWA9, "", pattern>,
- VOP <opName>,
- SIMCInstr <opName#"_sdwa9", SIEncodingFamily.NONE>,
- MnemonicAlias <opName#"_sdwa9", opName> {
-
- let isPseudo = 1;
- let isCodeGenOnly = 1;
- let UseNamedOperandTable = 1;
-
- string Mnemonic = opName;
- string AsmOperands = P.AsmSDWA9;
-
- let Size = 8;
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
-
- let VALU = 1;
- let SDWA = 1;
- let Uses = [EXEC];
-
- let SubtargetPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst);
- let AssemblerPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst);
- let AsmVariantName = !if(P.HasSDWA9, AMDGPUAsmVariants.SDWA9,
- AMDGPUAsmVariants.Disable);
- let DecoderNamespace = "SDWA9";
-
- VOPProfile Pfl = P;
-}
-
class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
- SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA> {
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -443,9 +413,9 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
let TSFlags = ps.TSFlags;
}
-class VOP_SDWA9_Real <VOP_SDWA9_Pseudo ps> :
- InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
- SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
+class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands9, []>,
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA9> {
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -458,13 +428,15 @@ class VOP_SDWA9_Real <VOP_SDWA9_Pseudo ps> :
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
+ let SubtargetPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst);
+ let AssemblerPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst);
+ let AsmVariantName = !if(ps.Pfl.HasSDWA9, AMDGPUAsmVariants.SDWA9,
+ AMDGPUAsmVariants.Disable);
+ let DecoderNamespace = "SDWA9";
+
// Copy relevant pseudo op flags
- let SubtargetPredicate = ps.SubtargetPredicate;
- let AssemblerPredicate = ps.AssemblerPredicate;
let AsmMatchConverter = ps.AsmMatchConverter;
- let AsmVariantName = ps.AsmVariantName;
let UseNamedOperandTable = ps.UseNamedOperandTable;
- let DecoderNamespace = ps.DecoderNamespace;
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
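
As a rough illustration of the SDWAB mechanism described in the comment block moved above: with SD clear the compare result lands in VCC, with SD set it goes to the SGPR named by SDST. The bit positions in this standalone C++ sketch are invented for demonstration and do not match the real GFX9 encoding.

#include <cstdint>
#include <cstdio>

constexpr unsigned SD_BIT = 15;     // hypothetical position of the SD flag
constexpr unsigned SDST_SHIFT = 8;  // hypothetical position of the SDST field
constexpr unsigned SDST_MASK = 0x7F;

// Returns the SGPR number holding the compare result, or -1 for VCC.
int compareDest(uint16_t sdwabWord) {
  bool sd = (sdwabWord >> SD_BIT) & 1;
  if (!sd)
    return -1;  // SD=0: the result is written to VCC.
  return (sdwabWord >> SDST_SHIFT) & SDST_MASK;  // SD=1: explicit SGPR dest.
}

int main() {
  printf("%d\n", compareDest(0));                                   // -1 (VCC)
  printf("%d\n", compareDest((1u << SD_BIT) | (5u << SDST_SHIFT))); // 5
}
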
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index ca68f5d42c32c..6f67183df6a18 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -100,7 +100,8 @@ def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
"Enable Reliability, Availability and Serviceability extensions">;
def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true",
"Enable fast computation of positive address offsets">;
-
+def FeatureFuseAES : SubtargetFeature<"fuse-aes", "HasFuseAES", "true",
+ "CPU fuses AES crypto operations">;
// Cyclone has preferred instructions for zeroing VFP registers, which can
// execute in 0 cycles.
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index f9da036c7e468..90f635c812542 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -1504,6 +1504,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case ARM::CONSTPOOL_ENTRY: {
+ if (Subtarget->genExecuteOnly())
+ llvm_unreachable("execute-only should not generate constant pools");
+
/// CONSTPOOL_ENTRY - This instruction represents a floating constant pool
/// in the function. The first operand is the ID# for this instruction, the
/// second is the index into the MachineConstantPool that this is, the third
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 8715657ad5e25..e0810c358f2da 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -665,12 +665,14 @@ bool ARMBaseInstrInfo::isPredicable(const MachineInstr &MI) const {
const ARMFunctionInfo *AFI =
MI.getParent()->getParent()->getInfo<ARMFunctionInfo>();
+ // Neon instructions in Thumb2 IT blocks are deprecated, see ARMARM.
+ // In their ARM encoding, they can't be encoded in a conditional form.
+ if ((MI.getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON)
+ return false;
+
if (AFI->isThumb2Function()) {
if (getSubtarget().restrictIT())
return isV8EligibleForIT(&MI);
- } else { // non-Thumb
- if ((MI.getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON)
- return false;
}
return true;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 5b2d093e8f0da..2bcc707e9fc3c 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -2669,12 +2669,35 @@ static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOVi.
-static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
+SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
+ SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
// FIXME there is no actual debug info here
SDLoc dl(Op);
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
SDValue Res;
+
+ // When generating execute-only code, Constant Pools must be promoted to the
+ // global data section. It's a bit ugly that we can't share them across basic
+ // blocks, but this way we guarantee that execute-only behaves correctly with
+ // position-independent addressing modes.
+ if (Subtarget->genExecuteOnly()) {
+ auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
+ auto T = const_cast<Type*>(CP->getType());
+ auto C = const_cast<Constant*>(CP->getConstVal());
+ auto M = const_cast<Module*>(DAG.getMachineFunction().
+ getFunction()->getParent());
+ auto GV = new GlobalVariable(
+ *M, T, /*isConst=*/true, GlobalVariable::InternalLinkage, C,
+ Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
+ Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
+ Twine(AFI->createPICLabelUId())
+ );
+ SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
+ dl, PtrVT);
+ return LowerGlobalAddress(GA, DAG);
+ }
+
if (CP->isMachineConstantPoolEntry())
Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
CP->getAlignment());
@@ -3118,6 +3141,19 @@ static bool isReadOnly(const GlobalValue *GV) {
isa<Function>(GV);
}
+SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ switch (Subtarget->getTargetTriple().getObjectFormat()) {
+ default: llvm_unreachable("unknown object format");
+ case Triple::COFF:
+ return LowerGlobalAddressWindows(Op, DAG);
+ case Triple::ELF:
+ return LowerGlobalAddressELF(Op, DAG);
+ case Triple::MachO:
+ return LowerGlobalAddressDarwin(Op, DAG);
+ }
+}
+
SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -7634,21 +7670,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Don't know how to custom lower this!");
case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
- case ISD::ConstantPool:
- if (Subtarget->genExecuteOnly())
- llvm_unreachable("execute-only should not generate constant pools");
- return LowerConstantPool(Op, DAG);
+ case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
- case ISD::GlobalAddress:
- switch (Subtarget->getTargetTriple().getObjectFormat()) {
- default: llvm_unreachable("unknown object format");
- case Triple::COFF:
- return LowerGlobalAddressWindows(Op, DAG);
- case Triple::ELF:
- return LowerGlobalAddressELF(Op, DAG);
- case Triple::MachO:
- return LowerGlobalAddressDarwin(Op, DAG);
- }
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
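
A toy model of the execute-only promotion performed by LowerConstantPool above: each would-be constant-pool entry becomes a uniquely named internal global, and the caller lowers a global address instead of a pool load. Module, the global map, and the ".LCP<fn>_<uid>" naming here are hypothetical stand-ins for the real LLVM types.

#include <cstdio>
#include <map>
#include <string>

// Hypothetical stand-ins for Module / GlobalVariable.
struct Module {
  std::map<std::string, double> Globals;  // name -> constant payload
};

// Promote a would-be constant-pool entry to a uniquely named, internal,
// read-only global; the caller then lowers a global address instead of a
// constant-pool load.
std::string promoteToGlobal(Module &M, unsigned FnNumber, unsigned &UId,
                            double C) {
  std::string Name =
      ".LCP" + std::to_string(FnNumber) + "_" + std::to_string(UId++);
  M.Globals.emplace(Name, C);
  return Name;
}

int main() {
  Module M;
  unsigned UId = 0;
  printf("%s\n", promoteToGlobal(M, 3, UId, 3.14159).c_str()); // .LCP3_0
  printf("%s\n", promoteToGlobal(M, 3, UId, 2.71828).c_str()); // .LCP3_1
}
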
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 26da528c19e6d..5044134f5b1e2 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -601,6 +601,8 @@ class InstrItineraryData;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 45471a4e95b39..53db5acbe805c 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -4756,6 +4756,16 @@ def t2MOVsr: t2AsmPseudo<"mov${p} $Rd, $shift",
def t2MOVSsr: t2AsmPseudo<"movs${p} $Rd, $shift",
(ins rGPR:$Rd, so_reg_reg:$shift, pred:$p)>;
+// Aliases for the above with the .w qualifier
+def : t2InstAlias<"mov${p}.w $Rd, $shift",
+ (t2MOVsi rGPR:$Rd, t2_so_reg:$shift, pred:$p)>;
+def : t2InstAlias<"movs${p}.w $Rd, $shift",
+ (t2MOVSsi rGPR:$Rd, t2_so_reg:$shift, pred:$p)>;
+def : t2InstAlias<"mov${p}.w $Rd, $shift",
+ (t2MOVsr rGPR:$Rd, so_reg_reg:$shift, pred:$p)>;
+def : t2InstAlias<"movs${p}.w $Rd, $shift",
+ (t2MOVSsr rGPR:$Rd, so_reg_reg:$shift, pred:$p)>;
+
// ADR w/o the .w suffix
def : t2InstAlias<"adr${p} $Rd, $addr",
(t2ADR rGPR:$Rd, t2adrlabel:$addr, pred:$p)>;
diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp
index 2ae3bad4076b0..4cb0eca5ee5f8 100644
--- a/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -42,6 +42,10 @@ public:
private:
bool selectImpl(MachineInstr &I) const;
+ bool selectICmp(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII,
+ MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) const;
+
const ARMBaseInstrInfo &TII;
const ARMBaseRegisterInfo &TRI;
const ARMBaseTargetMachine &TM;
@@ -243,6 +247,105 @@ static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank,
return Opc;
}
+static ARMCC::CondCodes getComparePred(CmpInst::Predicate Pred) {
+ switch (Pred) {
+ // Needs two compares...
+ case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_UEQ:
+ default:
+ // AL is our "false" for now. The other two need more compares.
+ return ARMCC::AL;
+ case CmpInst::ICMP_EQ:
+ case CmpInst::FCMP_OEQ:
+ return ARMCC::EQ;
+ case CmpInst::ICMP_SGT:
+ case CmpInst::FCMP_OGT:
+ return ARMCC::GT;
+ case CmpInst::ICMP_SGE:
+ case CmpInst::FCMP_OGE:
+ return ARMCC::GE;
+ case CmpInst::ICMP_UGT:
+ case CmpInst::FCMP_UGT:
+ return ARMCC::HI;
+ case CmpInst::FCMP_OLT:
+ return ARMCC::MI;
+ case CmpInst::ICMP_ULE:
+ case CmpInst::FCMP_OLE:
+ return ARMCC::LS;
+ case CmpInst::FCMP_ORD:
+ return ARMCC::VC;
+ case CmpInst::FCMP_UNO:
+ return ARMCC::VS;
+ case CmpInst::FCMP_UGE:
+ return ARMCC::PL;
+ case CmpInst::ICMP_SLT:
+ case CmpInst::FCMP_ULT:
+ return ARMCC::LT;
+ case CmpInst::ICMP_SLE:
+ case CmpInst::FCMP_ULE:
+ return ARMCC::LE;
+ case CmpInst::FCMP_UNE:
+ case CmpInst::ICMP_NE:
+ return ARMCC::NE;
+ case CmpInst::ICMP_UGE:
+ return ARMCC::HS;
+ case CmpInst::ICMP_ULT:
+ return ARMCC::LO;
+ }
+}
+
+bool ARMInstructionSelector::selectICmp(MachineInstrBuilder &MIB,
+ const ARMBaseInstrInfo &TII,
+ MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) const {
+ auto &MBB = *MIB->getParent();
+ auto InsertBefore = std::next(MIB->getIterator());
+ auto &DebugLoc = MIB->getDebugLoc();
+
+ // Move 0 into the result register.
+ auto Mov0I = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::MOVi))
+ .addDef(MRI.createVirtualRegister(&ARM::GPRRegClass))
+ .addImm(0)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
+ if (!constrainSelectedInstRegOperands(*Mov0I, TII, TRI, RBI))
+ return false;
+
+ // Perform the comparison.
+ auto LHSReg = MIB->getOperand(2).getReg();
+ auto RHSReg = MIB->getOperand(3).getReg();
+ assert(MRI.getType(LHSReg) == MRI.getType(RHSReg) &&
+ MRI.getType(LHSReg).getSizeInBits() == 32 &&
+ MRI.getType(RHSReg).getSizeInBits() == 32 &&
+ "Unsupported types for comparison operation");
+ auto CmpI = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::CMPrr))
+ .addUse(LHSReg)
+ .addUse(RHSReg)
+ .add(predOps(ARMCC::AL));
+ if (!constrainSelectedInstRegOperands(*CmpI, TII, TRI, RBI))
+ return false;
+
+ // Move 1 into the result register if the flags say so.
+ auto ResReg = MIB->getOperand(0).getReg();
+ auto Cond =
+ static_cast<CmpInst::Predicate>(MIB->getOperand(1).getPredicate());
+ auto ARMCond = getComparePred(Cond);
+ if (ARMCond == ARMCC::AL)
+ return false;
+
+ auto Mov1I = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::MOVCCi))
+ .addDef(ResReg)
+ .addUse(Mov0I->getOperand(0).getReg())
+ .addImm(1)
+ .add(predOps(ARMCond, ARM::CPSR));
+ if (!constrainSelectedInstRegOperands(*Mov1I, TII, TRI, RBI))
+ return false;
+
+ MIB->eraseFromParent();
+ return true;
+}
+
bool ARMInstructionSelector::select(MachineInstr &I) const {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -343,6 +446,8 @@ bool ARMInstructionSelector::select(MachineInstr &I) const {
I.setDesc(TII.get(COPY));
return selectCopy(I, TII, MRI, TRI, RBI);
}
+ case G_ICMP:
+ return selectICmp(MIB, TII, MRI, TRI, RBI);
case G_GEP:
I.setDesc(TII.get(ARM::ADDrr));
MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp
index a706079d98662..5873c7fb38729 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -86,6 +86,12 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
setAction({G_CONSTANT, s32}, Legal);
+ setAction({G_ICMP, s1}, Legal);
+ for (auto Ty : {s8, s16})
+ setAction({G_ICMP, 1, Ty}, WidenScalar);
+ for (auto Ty : {s32, p0})
+ setAction({G_ICMP, 1, Ty}, Legal);
+
if (!ST.useSoftFloat() && ST.hasVFP2()) {
setAction({G_FADD, s32}, Legal);
setAction({G_FADD, s64}, Legal);
diff --git a/lib/Target/ARM/ARMMacroFusion.cpp b/lib/Target/ARM/ARMMacroFusion.cpp
new file mode 100644
index 0000000000000..1b6e97c28d453
--- /dev/null
+++ b/lib/Target/ARM/ARMMacroFusion.cpp
@@ -0,0 +1,57 @@
+//===- ARMMacroFusion.cpp - ARM Macro Fusion ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the ARM implementation of the DAG scheduling
+/// mutation to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMacroFusion.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/MacroFusion.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace llvm {
+
+/// \brief Check if the instruction pair, FirstMI and SecondMI, should be
+/// fused together. When FirstMI is unspecified, check whether SecondMI may
+/// be part of a fused pair at all.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+ const TargetSubtargetInfo &TSI,
+ const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(TSI);
+
+ // Assume wildcards for unspecified instrs.
+ unsigned FirstOpcode =
+ FirstMI ? FirstMI->getOpcode()
+ : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END);
+ unsigned SecondOpcode = SecondMI.getOpcode();
+
+ if (ST.hasFuseAES())
+ // Fuse AES crypto operations.
+ switch(SecondOpcode) {
+ // AES encode.
+ case ARM::AESMC :
+ return FirstOpcode == ARM::AESE ||
+ FirstOpcode == ARM::INSTRUCTION_LIST_END;
+ // AES decode.
+ case ARM::AESIMC:
+ return FirstOpcode == ARM::AESD ||
+ FirstOpcode == ARM::INSTRUCTION_LIST_END;
+ }
+
+ return false;
+}
+
+std::unique_ptr<ScheduleDAGMutation> createARMMacroFusionDAGMutation () {
+ return createMacroFusionDAGMutation(shouldScheduleAdjacent);
+}
+
+} // end namespace llvm
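
shouldScheduleAdjacent treats a missing FirstMI as a wildcard by substituting INSTRUCTION_LIST_END, letting one switch answer both "do these two fuse?" and "can SecondMI fuse with anything?". A self-contained sketch of that convention, with toy opcode values rather than ARM's real ones:

#include <cassert>

// Toy opcodes; not ARM's real opcode values.
enum Opcode { AESE, AESMC, AESD, AESIMC, ADD, INSTRUCTION_LIST_END };

bool shouldFuse(const Opcode *First, Opcode Second) {
  // Substitute the wildcard when the first instruction is unspecified.
  Opcode F = First ? *First : INSTRUCTION_LIST_END;
  switch (Second) {
  case AESMC:  return F == AESE || F == INSTRUCTION_LIST_END;  // AES encode
  case AESIMC: return F == AESD || F == INSTRUCTION_LIST_END;  // AES decode
  default:     return false;
  }
}

int main() {
  Opcode F = AESE;
  assert(shouldFuse(&F, AESMC));      // concrete pair fuses
  assert(shouldFuse(nullptr, AESMC)); // wildcard query also answers yes
  assert(!shouldFuse(nullptr, ADD));  // ADD never participates in fusion
}
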
diff --git a/lib/Target/ARM/ARMMacroFusion.h b/lib/Target/ARM/ARMMacroFusion.h
new file mode 100644
index 0000000000000..1e4fc6687eae8
--- /dev/null
+++ b/lib/Target/ARM/ARMMacroFusion.h
@@ -0,0 +1,24 @@
+//===- ARMMacroFusion.h - ARM Macro Fusion ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the ARM definition of the DAG scheduling mutation
+/// to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+/// Note that you have to add:
+/// DAG.addMutation(createARMMacroFusionDAGMutation());
+/// to ARMPassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation> createARMMacroFusionDAGMutation();
+
+} // end namespace llvm
diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp
index f59b075e6dd9a..2350d0c6ef69e 100644
--- a/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -255,6 +255,16 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OperandsMapping =
getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr});
break;
+ case G_ICMP: {
+ LLT Ty2 = MRI.getType(MI.getOperand(2).getReg());
+ (void)Ty2;
+ assert(Ty2.getSizeInBits() == 32 && "Unsupported size for G_ICMP");
+ OperandsMapping =
+ getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr,
+ &ARM::ValueMappings[ARM::GPR3OpsIdx],
+ &ARM::ValueMappings[ARM::GPR3OpsIdx]});
+ break;
+ }
case G_MERGE_VALUES: {
// We only support G_MERGE_VALUES for creating a double precision floating
// point value out of two GPRs.
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index af682dd8321cf..d890d0fa777e8 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -285,6 +285,10 @@ protected:
/// HasFPAO - if true, processor does positive address offset computation faster
bool HasFPAO = false;
+ /// HasFuseAES - if true, processor executes back-to-back AES instruction
+ /// pairs faster.
+ bool HasFuseAES = false;
+
/// If true, if conversion may decide to leave some instructions unpredicated.
bool IsProfitableToUnpredicate = false;
@@ -561,6 +565,10 @@ public:
bool hasD16() const { return HasD16; }
bool hasFullFP16() const { return HasFullFP16; }
+ bool hasFuseAES() const { return HasFuseAES; }
+ /// \brief Return true if the CPU supports any kind of instruction fusion.
+ bool hasFusion() const { return hasFuseAES(); }
+
const Triple &getTargetTriple() const { return TargetTriple; }
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index c0506cfda6129..eb71e557ec917 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -17,6 +17,7 @@
#include "ARMRegisterBankInfo.h"
#endif
#include "ARMSubtarget.h"
+#include "ARMMacroFusion.h"
#include "ARMTargetMachine.h"
#include "ARMTargetObjectFile.h"
#include "ARMTargetTransformInfo.h"
@@ -394,6 +395,9 @@ public:
createMachineScheduler(MachineSchedContext *C) const override {
ScheduleDAGMILive *DAG = createGenericSchedLive(C);
// add DAG Mutations here.
+ const ARMSubtarget &ST = C->MF->getSubtarget<ARMSubtarget>();
+ if (ST.hasFusion())
+ DAG->addMutation(createARMMacroFusionDAGMutation());
return DAG;
}
@@ -401,6 +405,9 @@ public:
createPostMachineScheduler(MachineSchedContext *C) const override {
ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
// add DAG Mutations here.
+ const ARMSubtarget &ST = C->MF->getSubtarget<ARMSubtarget>();
+ if (ST.hasFusion())
+ DAG->addMutation(createARMMacroFusionDAGMutation());
return DAG;
}
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 19fba3033bb2b..891b5c60e1fd6 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -6860,6 +6860,17 @@ static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) {
bool ARMAsmParser::processInstruction(MCInst &Inst,
const OperandVector &Operands,
MCStreamer &Out) {
+ // Check if we have the wide qualifier, because if it's present we
+ // must avoid selecting a 16-bit Thumb instruction.
+ bool HasWideQualifier = false;
+ for (auto &Op : Operands) {
+ ARMOperand &ARMOp = static_cast<ARMOperand&>(*Op);
+ if (ARMOp.isToken() && ARMOp.getToken() == ".w") {
+ HasWideQualifier = true;
+ break;
+ }
+ }
+
switch (Inst.getOpcode()) {
// Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction.
case ARM::LDRT_POST:
@@ -6939,8 +6950,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
// Select the narrow version if the immediate will fit.
if (Inst.getOperand(1).getImm() > 0 &&
Inst.getOperand(1).getImm() <= 0xff &&
- !(static_cast<ARMOperand &>(*Operands[2]).isToken() &&
- static_cast<ARMOperand &>(*Operands[2]).getToken() == ".w"))
+ !HasWideQualifier)
Inst.setOpcode(ARM::tLDRpci);
else
Inst.setOpcode(ARM::t2LDRpci);
@@ -6971,10 +6981,9 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
else if (Inst.getOpcode() == ARM::t2LDRConstPool)
TmpInst.setOpcode(ARM::t2LDRpci);
const ARMOperand &PoolOperand =
- (static_cast<ARMOperand &>(*Operands[2]).isToken() &&
- static_cast<ARMOperand &>(*Operands[2]).getToken() == ".w") ?
- static_cast<ARMOperand &>(*Operands[4]) :
- static_cast<ARMOperand &>(*Operands[3]);
+ (HasWideQualifier ?
+ static_cast<ARMOperand &>(*Operands[4]) :
+ static_cast<ARMOperand &>(*Operands[3]));
const MCExpr *SubExprVal = PoolOperand.getConstantPoolImm();
// If SubExprVal is a constant we may be able to use a MOV
if (isa<MCConstantExpr>(SubExprVal) &&
@@ -8117,8 +8126,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
isARMLowRegister(Inst.getOperand(1).getReg()) &&
Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
- !(static_cast<ARMOperand &>(*Operands[3]).isToken() &&
- static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) {
+ !HasWideQualifier) {
unsigned NewOpc;
switch (Inst.getOpcode()) {
default: llvm_unreachable("unexpected opcode");
@@ -8152,7 +8160,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
isARMLowRegister(Inst.getOperand(1).getReg()) &&
isARMLowRegister(Inst.getOperand(2).getReg()) &&
Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
- inITBlock() == (Inst.getOpcode() == ARM::t2MOVsr))
+ inITBlock() == (Inst.getOpcode() == ARM::t2MOVsr) &&
+ !HasWideQualifier)
isNarrow = true;
MCInst TmpInst;
unsigned newOpc;
@@ -8186,7 +8195,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
bool isNarrow = false;
if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
isARMLowRegister(Inst.getOperand(1).getReg()) &&
- inITBlock() == (Inst.getOpcode() == ARM::t2MOVsi))
+ inITBlock() == (Inst.getOpcode() == ARM::t2MOVsi) &&
+ !HasWideQualifier)
isNarrow = true;
MCInst TmpInst;
unsigned newOpc;
@@ -8415,10 +8425,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
!isARMLowRegister(Inst.getOperand(0).getReg()) ||
(Inst.getOperand(2).isImm() &&
(unsigned)Inst.getOperand(2).getImm() > 255) ||
- ((!inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR) ||
- (inITBlock() && Inst.getOperand(5).getReg() != 0)) ||
- (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
- static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w"))
+ Inst.getOperand(5).getReg() != (inITBlock() ? 0 : ARM::CPSR) ||
+ HasWideQualifier)
break;
MCInst TmpInst;
TmpInst.setOpcode(Inst.getOpcode() == ARM::t2ADDri ?
@@ -8447,8 +8455,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
}
if (!Transform ||
Inst.getOperand(5).getReg() != 0 ||
- (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
- static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w"))
+ HasWideQualifier)
break;
MCInst TmpInst;
TmpInst.setOpcode(ARM::tADDhirr);
@@ -8568,11 +8575,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
(Inst.getOperand(1).isImm() &&
(unsigned)Inst.getOperand(1).getImm() <= 255) &&
- ((!inITBlock() && Inst.getOperand(2).getImm() == ARMCC::AL &&
- Inst.getOperand(4).getReg() == ARM::CPSR) ||
- (inITBlock() && Inst.getOperand(4).getReg() == 0)) &&
- (!static_cast<ARMOperand &>(*Operands[2]).isToken() ||
- static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) {
+ Inst.getOperand(4).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
+ !HasWideQualifier) {
// The operands aren't in the same order for tMOVi8...
MCInst TmpInst;
TmpInst.setOpcode(ARM::tMOVi8);
@@ -8593,8 +8597,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
isARMLowRegister(Inst.getOperand(1).getReg()) &&
Inst.getOperand(2).getImm() == ARMCC::AL &&
Inst.getOperand(4).getReg() == ARM::CPSR &&
- (!static_cast<ARMOperand &>(*Operands[2]).isToken() ||
- static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) {
+ !HasWideQualifier) {
// The operands aren't the same for tMOV[S]r... (no cc_out)
MCInst TmpInst;
TmpInst.setOpcode(Inst.getOperand(4).getReg() ? ARM::tMOVSr : ARM::tMOVr);
@@ -8616,8 +8619,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
isARMLowRegister(Inst.getOperand(1).getReg()) &&
Inst.getOperand(2).getImm() == 0 &&
- (!static_cast<ARMOperand &>(*Operands[2]).isToken() ||
- static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) {
+ !HasWideQualifier) {
unsigned NewOpc;
switch (Inst.getOpcode()) {
default: llvm_unreachable("Illegal opcode!");
@@ -8716,11 +8718,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
if ((isARMLowRegister(Inst.getOperand(1).getReg()) &&
isARMLowRegister(Inst.getOperand(2).getReg())) &&
Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
- ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) ||
- (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) &&
- (!static_cast<ARMOperand &>(*Operands[3]).isToken() ||
- !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower(
- ".w"))) {
+ Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
+ !HasWideQualifier) {
unsigned NewOpc;
switch (Inst.getOpcode()) {
default: llvm_unreachable("unexpected opcode");
@@ -8756,11 +8755,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
isARMLowRegister(Inst.getOperand(2).getReg())) &&
(Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() ||
Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) &&
- ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) ||
- (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) &&
- (!static_cast<ARMOperand &>(*Operands[3]).isToken() ||
- !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower(
- ".w"))) {
+ Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
+ !HasWideQualifier) {
unsigned NewOpc;
switch (Inst.getOpcode()) {
default: llvm_unreachable("unexpected opcode");
diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt
index 3cde43967568b..cf6827fd6ca19 100644
--- a/lib/Target/ARM/CMakeLists.txt
+++ b/lib/Target/ARM/CMakeLists.txt
@@ -49,6 +49,7 @@ add_llvm_target(ARMCodeGen
ARMLoadStoreOptimizer.cpp
ARMMCInstLower.cpp
ARMMachineFunctionInfo.cpp
+ ARMMacroFusion.cpp
ARMRegisterInfo.cpp
ARMOptimizeBarriersPass.cpp
ARMSelectionDAGInfo.cpp
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 716492ea25662..81760f03940ad 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -358,11 +358,27 @@ static uint32_t joinHalfWords(uint32_t FirstHalf, uint32_t SecondHalf,
return Value;
}
-unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
+ const MCFixup &Fixup,
+ const MCValue &Target, uint64_t Value,
bool IsPCRel, MCContext &Ctx,
bool IsLittleEndian,
bool IsResolved) const {
unsigned Kind = Fixup.getKind();
+
+ // MachO tries to make .o files that look vaguely pre-linked, so for MOVW/MOVT
+ // and .word relocations it puts the Thumb bit into the addend if possible.
+ // Other relocation types don't want this bit though (branches couldn't encode
+ // it if it *was* present, and no other relocations exist) and it can
+ // interfere with checking valid expressions.
+ if (const MCSymbolRefExpr *A = Target.getSymA()) {
+ if (A->hasSubsectionsViaSymbols() && Asm.isThumbFunc(&A->getSymbol()) &&
+ (Kind == FK_Data_4 || Kind == ARM::fixup_arm_movw_lo16 ||
+ Kind == ARM::fixup_arm_movt_hi16 || Kind == ARM::fixup_t2_movw_lo16 ||
+ Kind == ARM::fixup_t2_movt_hi16))
+ Value |= 1;
+ }
+
switch (Kind) {
default:
Ctx.reportError(Fixup.getLoc(), "bad relocation fixup type");
@@ -505,6 +521,13 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
return swapHalfWords(out, IsLittleEndian);
}
case ARM::fixup_arm_thumb_bl: {
+ // FIXME: We get both thumb1 and thumb2 in here, so we can only check for
+ // the less strict thumb2 value.
+ if (!isInt<26>(Value - 4)) {
+ Ctx.reportError(Fixup.getLoc(), "Relocation out of range");
+ return 0;
+ }
+
// The value doesn't encode the low bit (always zero) and is offset by
// four. The 32-bit immediate value is encoded as
// imm32 = SignExtend(S:I1:I2:imm10:imm11:0)
@@ -716,29 +739,11 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
- const MCAsmLayout &Layout,
const MCFixup &Fixup,
- const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) {
+ const MCValue &Target, bool &IsResolved) {
const MCSymbolRefExpr *A = Target.getSymA();
const MCSymbol *Sym = A ? &A->getSymbol() : nullptr;
const unsigned FixupKind = Fixup.getKind() ;
- // MachO (the only user of "Value") tries to make .o files that look vaguely
- // pre-linked, so for MOVW/MOVT and .word relocations they put the Thumb bit
- // into the addend if possible. Other relocation types don't want this bit
- // though (branches couldn't encode it if it *was* present, and no other
- // relocations exist) and it can interfere with checking valid expressions.
- if (FixupKind == FK_Data_4 ||
- FixupKind == ARM::fixup_arm_movw_lo16 ||
- FixupKind == ARM::fixup_arm_movt_hi16 ||
- FixupKind == ARM::fixup_t2_movw_lo16 ||
- FixupKind == ARM::fixup_t2_movt_hi16) {
- if (Sym) {
- if (Asm.isThumbFunc(Sym))
- Value |= 1;
- }
- }
if (IsResolved && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) {
assert(Sym && "How did we resolve this?");
@@ -747,7 +752,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
// If the symbol is out of range, produce a relocation and hope the
// linker can handle it. GNU AS produces an error in this case.
- if (Sym->isExternal() || Value >= 0x400004)
+ if (Sym->isExternal())
IsResolved = false;
}
// Create relocations for unconditional branches to function symbols with
@@ -759,6 +764,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
IsResolved = false;
if (!Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_thumb_br ||
FixupKind == ARM::fixup_arm_thumb_bl ||
+ FixupKind == ARM::fixup_t2_condbranch ||
FixupKind == ARM::fixup_t2_uncondbranch))
IsResolved = false;
}
@@ -875,22 +881,25 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) {
}
}
-void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value, bool IsPCRel,
- MCContext &Ctx) const {
+void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool IsPCRel) const {
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
- Value = adjustFixupValue(Fixup, Value, IsPCRel, Ctx, IsLittleEndian, true);
+ MCContext &Ctx = Asm.getContext();
+ Value = adjustFixupValue(Asm, Fixup, Target, Value, IsPCRel, Ctx,
+ IsLittleEndian, true);
if (!Value)
return; // Doesn't change encoding.
unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+ assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
// Used to point to big endian bytes.
unsigned FullSizeBytes;
if (!IsLittleEndian) {
FullSizeBytes = getFixupKindContainerSizeBytes(Fixup.getKind());
- assert((Offset + FullSizeBytes) <= DataSize && "Invalid fixup size!");
+ assert((Offset + FullSizeBytes) <= Data.size() && "Invalid fixup size!");
assert(NumBytes <= FullSizeBytes && "Invalid fixup size!");
}
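
The new fixup_arm_thumb_bl guard above rejects any value that does not fit a signed 26-bit integer. For reference, a standalone equivalent of the isInt<26> test it relies on (a local reimplementation, not LLVM's header):

#include <cassert>
#include <cstdint>

// Local reimplementation of llvm::isInt<N> for demonstration.
template <unsigned N> bool isIntN(int64_t V) {
  return V >= -(INT64_C(1) << (N - 1)) && V < (INT64_C(1) << (N - 1));
}

int main() {
  assert(isIntN<26>(33554431));   //  2^25 - 1: fits
  assert(!isIntN<26>(33554432));  //  2^25: out of range, error reported
  assert(isIntN<26>(-33554432));  // -2^25: fits
}
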
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
index 2ddedb5d61059..6a0ba2ed41c1a 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -40,17 +40,17 @@ public:
/// processFixupValue - Target hook to process the literal value of a fixup
/// if necessary.
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override;
+ void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, bool &IsResolved) override;
- unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, bool IsPCRel,
+ unsigned adjustFixupValue(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, uint64_t Value, bool IsPCRel,
MCContext &Ctx, bool IsLittleEndian,
bool IsResolved) const;
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const override;
unsigned getRelaxedOpcode(unsigned Op) const;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
index 00505a103e00f..f74fb2e20b5a3 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
@@ -33,8 +33,8 @@ public:
~ARMWinCOFFObjectWriter() override = default;
- unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsCrossSection,
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsCrossSection,
const MCAsmBackend &MAB) const override;
bool recordRelocation(const MCFixup &) const override;
@@ -42,7 +42,8 @@ public:
} // end anonymous namespace
-unsigned ARMWinCOFFObjectWriter::getRelocType(const MCValue &Target,
+unsigned ARMWinCOFFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
const MCFixup &Fixup,
bool IsCrossSection,
const MCAsmBackend &MAB) const {
diff --git a/lib/Target/BPF/BPFISelDAGToDAG.cpp b/lib/Target/BPF/BPFISelDAGToDAG.cpp
index 7d5fb6ca17b98..c6ddd6bdad5e6 100644
--- a/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -214,7 +214,12 @@ void BPFDAGToDAGISel::PreprocessISelDAG() {
if (Opcode != ISD::LOAD)
continue;
- unsigned char new_val[8]; // hold up the constant values replacing loads.
+ union {
+ uint8_t c[8];
+ uint16_t s;
+ uint32_t i;
+ uint64_t d;
+ } new_val; // holds the constant value that replaces the load
bool to_replace = false;
SDLoc DL(Node);
const LoadSDNode *LD = cast<LoadSDNode>(Node);
@@ -242,7 +247,7 @@ void BPFDAGToDAGISel::PreprocessISelDAG() {
const ConstantSDNode *CDN = dyn_cast<ConstantSDNode>(OP2.getNode());
if (GADN && CDN)
to_replace =
- getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val);
+ getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val.c);
} else if (LDAddrNode->getOpcode() > ISD::BUILTIN_OP_END &&
LDAddrNode->getNumOperands() > 0) {
DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
@@ -250,7 +255,7 @@ void BPFDAGToDAGISel::PreprocessISelDAG() {
SDValue OP1 = LDAddrNode->getOperand(0);
if (const GlobalAddressSDNode *GADN =
dyn_cast<GlobalAddressSDNode>(OP1.getNode()))
- to_replace = getConstantFieldValue(GADN, 0, size, new_val);
+ to_replace = getConstantFieldValue(GADN, 0, size, new_val.c);
}
if (!to_replace)
@@ -259,13 +264,13 @@ void BPFDAGToDAGISel::PreprocessISelDAG() {
// replacing the old with a new value
uint64_t val;
if (size == 1)
- val = *(uint8_t *)new_val;
+ val = new_val.c[0];
else if (size == 2)
- val = *(uint16_t *)new_val;
+ val = new_val.s;
else if (size == 4)
- val = *(uint32_t *)new_val;
+ val = new_val.i;
else {
- val = *(uint64_t *)new_val;
+ val = new_val.d;
}
DEBUG(dbgs() << "Replacing load of size " << size << " with constant "
@@ -318,14 +323,17 @@ bool BPFDAGToDAGISel::getConstantFieldValue(const GlobalAddressSDNode *Node,
}
// test whether host endianness matches target
- uint8_t test_buf[2];
+ union {
+ uint8_t c[2];
+ uint16_t s;
+ } test_buf;
uint16_t test_val = 0x2345;
if (DL.isLittleEndian())
- support::endian::write16le(test_buf, test_val);
+ support::endian::write16le(test_buf.c, test_val);
else
- support::endian::write16be(test_buf, test_val);
+ support::endian::write16be(test_buf.c, test_val);
- bool endian_match = *(uint16_t *)test_buf == test_val;
+ bool endian_match = test_buf.s == test_val;
for (uint64_t i = Offset, j = 0; i < Offset + Size; i++, j++)
ByteSeq[j] = endian_match ? TmpVal[i] : TmpVal[Offset + Size - 1 - j];
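
The BPF changes replace *(uint16_t *)buf casts with unions, sidestepping pointer-based type punning. A standalone version of the endianness probe follows; it assumes, as the patch does, that reading a union member other than the one last written is acceptable to the compilers in use (GCC and Clang document this, though strict ISO C++ does not):

#include <cstdint>
#include <cstdio>

int main() {
  union {
    uint8_t c[2];
    uint16_t s;
  } test_buf;
  const uint16_t test_val = 0x2345;
  // Write the value byte-by-byte in little-endian order...
  test_buf.c[0] = test_val & 0xff;
  test_buf.c[1] = test_val >> 8;
  // ...and read it back through the native 16-bit view.
  bool host_is_little = (test_buf.s == test_val);
  printf("host is %s-endian\n", host_is_little ? "little" : "big");
}
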
diff --git a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index 80357a63a4e12..15e89fb2a2611 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -27,8 +27,9 @@ public:
: MCAsmBackend(), IsLittleEndian(IsLittleEndian) {}
~BPFAsmBackend() override = default;
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const override;
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
@@ -61,9 +62,10 @@ bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
return true;
}
-void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value, bool IsPCRel,
- MCContext &Ctx) const {
+void BPFAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool IsPCRel) const {
if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) {
assert(Value == 0);
} else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) {
diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp
index 9f8c9ded8127b..734f3c6658d92 100644
--- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp
+++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp
@@ -567,8 +567,19 @@ unsigned HexagonExpandCondsets::getCondTfrOpcode(const MachineOperand &SO,
}
llvm_unreachable("Invalid register operand");
}
- if (SO.isImm() || SO.isFPImm())
- return IfTrue ? C2_cmoveit : C2_cmoveif;
+ switch (SO.getType()) {
+ case MachineOperand::MO_Immediate:
+ case MachineOperand::MO_FPImmediate:
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_TargetIndex:
+ case MachineOperand::MO_JumpTableIndex:
+ case MachineOperand::MO_ExternalSymbol:
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_BlockAddress:
+ return IfTrue ? C2_cmoveit : C2_cmoveif;
+ default:
+ break;
+ }
llvm_unreachable("Unexpected source operand");
}
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 18e49c69b8e36..2b0ceaa66258e 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -1051,10 +1051,26 @@ int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF,
bool HasExtraAlign = HRI.needsStackRealignment(MF);
bool NoOpt = MF.getTarget().getOptLevel() == CodeGenOpt::None;
+ unsigned FrameSize = MFI.getStackSize();
unsigned SP = HRI.getStackRegister(), FP = HRI.getFrameRegister();
auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
unsigned AP = HMFI.getStackAlignBasePhysReg();
- unsigned FrameSize = MFI.getStackSize();
+ // It may happen that AP will be absent even when HasAlloca && HasExtraAlign
+ // is true. HasExtraAlign may be set because of vector spills, without
+ // aligned locals or aligned outgoing function arguments. Since vector
+ // spills will ultimately be "unaligned", it is safe to use FP as the
+ // base register.
+ // In fact, in such a scenario the stack is actually not required to be
+ // aligned, although it may end up being aligned anyway, since this
+ // particular case is not easily detectable. The alignment will be
+ // unnecessary, but not incorrect.
+ // Unfortunately there is no quick way to verify that the above is
+ // indeed the case (and that it's not a result of an error), so just
+ // assume that missing AP will be replaced by FP.
+ // (A better fix would be to rematerialize AP from FP and always align
+ // vector spills.)
+ if (AP == 0)
+ AP = FP;
bool UseFP = false, UseAP = false; // Default: use SP (except at -O0).
// Use FP at -O0, except when there are objects with extra alignment.
@@ -2454,9 +2470,44 @@ bool HexagonFrameLowering::mayOverflowFrameOffset(MachineFunction &MF) const {
unsigned StackSize = MF.getFrameInfo().estimateStackSize(MF);
auto &HST = MF.getSubtarget<HexagonSubtarget>();
// A fairly simplistic guess as to whether a potential load/store to a
- // stack location could require an extra register. It does not account
- // for store-immediate instructions.
- if (HST.useHVXOps())
- return StackSize > 256;
+ // stack location could require an extra register.
+ if (HST.useHVXOps() && StackSize > 256)
+ return true;
+
+ // Check if the function has store-immediate instructions that access
+ // the stack. Since the offset field is not extendable, if the stack
+ // size exceeds the offset limit (6 bits, shifted), the stores will
+ // require a new base register.
+ bool HasImmStack = false;
+ unsigned MinLS = ~0u; // Log_2 of the memory access size.
+
+ for (const MachineBasicBlock &B : MF) {
+ for (const MachineInstr &MI : B) {
+ unsigned LS = 0;
+ switch (MI.getOpcode()) {
+ case Hexagon::S4_storeirit_io:
+ case Hexagon::S4_storeirif_io:
+ case Hexagon::S4_storeiri_io:
+ ++LS;
+ LLVM_FALLTHROUGH;
+ case Hexagon::S4_storeirht_io:
+ case Hexagon::S4_storeirhf_io:
+ case Hexagon::S4_storeirh_io:
+ ++LS;
+ LLVM_FALLTHROUGH;
+ case Hexagon::S4_storeirbt_io:
+ case Hexagon::S4_storeirbf_io:
+ case Hexagon::S4_storeirb_io:
+ if (MI.getOperand(0).isFI())
+ HasImmStack = true;
+ MinLS = std::min(MinLS, LS);
+ break;
+ }
+ }
+ }
+
+ if (HasImmStack)
+ return !isUInt<6>(StackSize >> MinLS);
+
return false;
}
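
The store-immediate check in mayOverflowFrameOffset reduces to whether StackSize, scaled down by the smallest access size observed, still fits an unsigned 6-bit offset field. A worked check of that arithmetic (assertions derived from the 6-bit limit, not taken from the patch):

#include <cassert>

// The offset field is 6 bits, unsigned, scaled by the access size, so a
// byte store reaches offsets up to 63, a halfword up to 126, a word up
// to 252. MinLS is log2 of the smallest access size observed.
bool mayOverflow(unsigned StackSize, unsigned MinLS) {
  return (StackSize >> MinLS) >= 64;  // i.e. !isUInt<6>(StackSize >> MinLS)
}

int main() {
  assert(!mayOverflow(63, 0));  // byte stores: 63 still fits
  assert(mayOverflow(64, 0));   // ...64 does not
  assert(!mayOverflow(126, 1)); // halfword stores: 126 >> 1 == 63
  assert(!mayOverflow(252, 2)); // word stores: 252 >> 2 == 63
  assert(mayOverflow(256, 2));  // 256 >> 2 == 64: needs a new base reg
}
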
diff --git a/lib/Target/Hexagon/HexagonGenMux.cpp b/lib/Target/Hexagon/HexagonGenMux.cpp
index 11ac5454f6043..5abbcbba72ddd 100644
--- a/lib/Target/Hexagon/HexagonGenMux.cpp
+++ b/lib/Target/Hexagon/HexagonGenMux.cpp
@@ -28,6 +28,7 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -295,15 +296,12 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
unsigned SR1 = Src1->isReg() ? Src1->getReg() : 0;
unsigned SR2 = Src2->isReg() ? Src2->getReg() : 0;
bool Failure = false, CanUp = true, CanDown = true;
- bool Used1 = false, Used2 = false;
for (unsigned X = MinX+1; X < MaxX; X++) {
const DefUseInfo &DU = DUM.lookup(X);
if (DU.Defs[PR] || DU.Defs[DR] || DU.Uses[DR]) {
Failure = true;
break;
}
- Used1 |= DU.Uses[SR1];
- Used2 |= DU.Uses[SR2];
if (CanDown && DU.Defs[SR1])
CanDown = false;
if (CanUp && DU.Defs[SR2])
@@ -317,64 +315,52 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
// Prefer "down", since this will move the MUX farther away from the
// predicate definition.
MachineBasicBlock::iterator At = CanDown ? Def2 : Def1;
- if (CanDown) {
- // If the MUX is placed "down", we need to make sure that there aren't
- // any kills of the source registers between the two defs.
- if (Used1 || Used2) {
- auto ResetKill = [this] (unsigned Reg, MachineInstr &MI) -> bool {
- if (MachineOperand *Op = MI.findRegisterUseOperand(Reg, true, HRI)) {
- Op->setIsKill(false);
- return true;
- }
- return false;
- };
- bool KilledSR1 = false, KilledSR2 = false;
- for (MachineInstr &MJ : make_range(std::next(It1), It2)) {
- if (SR1)
- KilledSR1 |= ResetKill(SR1, MJ);
- if (SR2)
- KilledSR2 |= ResetKill(SR1, MJ);
- }
- // If any of the source registers were killed in this range, transfer
- // the kills to the source operands: they will me "moved" to the
- // resulting MUX and their parent instructions will be deleted.
- if (KilledSR1) {
- assert(Src1->isReg());
- Src1->setIsKill(true);
- }
- if (KilledSR2) {
- assert(Src2->isReg());
- Src2->setIsKill(true);
- }
- }
- } else {
- // If the MUX is placed "up", it shouldn't kill any source registers
- // that are still used afterwards. We can reset the kill flags directly
- // on the operands, because the source instructions will be erased.
- if (Used1 && Src1->isReg())
- Src1->setIsKill(false);
- if (Used2 && Src2->isReg())
- Src2->setIsKill(false);
- }
ML.push_back(MuxInfo(At, DR, PR, SrcT, SrcF, Def1, Def2));
}
- for (unsigned I = 0, N = ML.size(); I < N; ++I) {
- MuxInfo &MX = ML[I];
- MachineBasicBlock &B = *MX.At->getParent();
- DebugLoc DL = MX.At->getDebugLoc();
+ for (MuxInfo &MX : ML) {
unsigned MxOpc = getMuxOpcode(*MX.SrcT, *MX.SrcF);
if (!MxOpc)
continue;
- BuildMI(B, MX.At, DL, HII->get(MxOpc), MX.DefR)
- .addReg(MX.PredR)
- .add(*MX.SrcT)
- .add(*MX.SrcF);
+ MachineBasicBlock &B = *MX.At->getParent();
+ const DebugLoc &DL = B.findDebugLoc(MX.At);
+ auto NewMux = BuildMI(B, MX.At, DL, HII->get(MxOpc), MX.DefR)
+ .addReg(MX.PredR)
+ .add(*MX.SrcT)
+ .add(*MX.SrcF);
+ NewMux->clearKillInfo();
B.erase(MX.Def1);
B.erase(MX.Def2);
Changed = true;
}
+ // Fix up kill flags.
+
+ LivePhysRegs LPR(*HRI);
+ LPR.addLiveOuts(B);
+ auto IsLive = [&LPR,this] (unsigned Reg) -> bool {
+ for (MCSubRegIterator S(Reg, HRI, true); S.isValid(); ++S)
+ if (LPR.contains(*S))
+ return true;
+ return false;
+ };
+ for (auto I = B.rbegin(), E = B.rend(); I != E; ++I) {
+ if (I->isDebugValue())
+ continue;
+ // This isn't 100% accurate, but it's safe.
+ // It won't detect (as a kill) a case like this:
+ // r0 = add r0, 1 <-- r0 should be "killed"
+ // ... = r0
+ for (MachineOperand &Op : I->operands()) {
+ if (!Op.isReg() || !Op.isUse())
+ continue;
+ assert(Op.getSubReg() == 0 && "Should have physical registers only");
+ bool Live = IsLive(Op.getReg());
+ Op.setIsKill(!Live);
+ }
+ LPR.stepBackward(*I);
+ }
+
return Changed;
}
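
The rewritten epilogue of genMuxInBlock recomputes kill flags in one backward pass: a use is a kill exactly when its register is not live below the instruction, and liveness is then stepped backward across it. A toy model of that walk, using plain ints in place of LivePhysRegs and register units:

#include <cstdio>
#include <set>
#include <vector>

// Toy instruction: one def and one use, both plain register numbers.
struct Inst {
  int Def;
  int Use;
  bool UseIsKill;
};

// Walk the block backward: a use kills its register exactly when the
// register is not live below the instruction; then step liveness upward.
void fixKills(std::vector<Inst> &Block, std::set<int> Live /*live-outs*/) {
  for (auto I = Block.rbegin(), E = Block.rend(); I != E; ++I) {
    I->UseIsKill = !Live.count(I->Use);
    Live.erase(I->Def);   // the def starts liveness here, nothing above it
    Live.insert(I->Use);  // the use extends liveness upward
  }
}

int main() {
  // r2 = r1; r3 = r1  -- only the second (last) use of r1 is a kill.
  std::vector<Inst> B = {{2, 1, false}, {3, 1, false}};
  fixKills(B, /*Live=*/{});
  printf("%d %d\n", B[0].UseIsKill, B[1].UseIsKill);  // prints: 0 1
}
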
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index f43101fa456d5..fec2dc5ce3066 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -94,10 +94,6 @@ static cl::opt<bool> UseDFAHazardRec("dfa-hazard-rec",
///
/// Constants for Hexagon instructions.
///
-const int Hexagon_MEMV_OFFSET_MAX_128B = 896; // #s4: -8*128...7*128
-const int Hexagon_MEMV_OFFSET_MIN_128B = -1024; // #s4
-const int Hexagon_MEMV_OFFSET_MAX = 448; // #s4: -8*64...7*64
-const int Hexagon_MEMV_OFFSET_MIN = -512; // #s4
const int Hexagon_MEMW_OFFSET_MAX = 4095;
const int Hexagon_MEMW_OFFSET_MIN = -4096;
const int Hexagon_MEMD_OFFSET_MAX = 8191;
@@ -2443,8 +2439,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
case Hexagon::V6_vS32b_ai:
case Hexagon::V6_vL32Ub_ai:
case Hexagon::V6_vS32Ub_ai:
- return (Offset >= Hexagon_MEMV_OFFSET_MIN) &&
- (Offset <= Hexagon_MEMV_OFFSET_MAX);
+ return isShiftedInt<4,6>(Offset);
case Hexagon::PS_vstorerq_ai_128B:
case Hexagon::PS_vstorerw_ai_128B:
@@ -2454,8 +2449,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
case Hexagon::V6_vS32b_ai_128B:
case Hexagon::V6_vL32Ub_ai_128B:
case Hexagon::V6_vS32Ub_ai_128B:
- return (Offset >= Hexagon_MEMV_OFFSET_MIN_128B) &&
- (Offset <= Hexagon_MEMV_OFFSET_MAX_128B);
+ return isShiftedInt<4,7>(Offset);
case Hexagon::J2_loop0i:
case Hexagon::J2_loop1i:
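
The isValidOffset rewrite above swaps the deleted MEMV_OFFSET constants for isShiftedInt<4,6> and isShiftedInt<4,7>: a signed 4-bit field scaled by the 64- or 128-byte vector access size. A quick demonstration (using a local reimplementation of the predicate) that this matches the old [-512, 448] and [-1024, 896] ranges:

#include <cassert>
#include <cstdint>

// Local reimplementation of llvm::isShiftedInt<N, S> for demonstration:
// V must be a multiple of 2^S and fit in a signed (N+S)-bit integer.
template <unsigned N, unsigned S> bool isShiftedIntNS(int64_t V) {
  return (V % (INT64_C(1) << S)) == 0 &&
         V >= -(INT64_C(1) << (N + S - 1)) && V < (INT64_C(1) << (N + S - 1));
}

int main() {
  // <4,6>: multiples of 64 in [-8*64, 7*64] = [-512, 448] (64-byte mode).
  assert(isShiftedIntNS<4, 6>(-512) && isShiftedIntNS<4, 6>(448));
  assert(!isShiftedIntNS<4, 6>(449)); // not a multiple of 64
  assert(!isShiftedIntNS<4, 6>(512)); // multiple of 64 but out of range
  // <4,7>: multiples of 128 in [-1024, 896] (128-byte mode).
  assert(isShiftedIntNS<4, 7>(-1024) && isShiftedIntNS<4, 7>(896));
}
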
diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
index d73fc7c73185d..de6b203015d8e 100644
--- a/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -629,7 +629,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
if (MO.isReg() && MO.isUse()) {
unsigned feederReg = MO.getReg();
for (MachineBasicBlock::iterator localII = feederPos,
- end = jmpPos; localII != end; localII++) {
+ end = cmpInstr->getIterator(); localII != end; localII++) {
MachineInstr &localMI = *localII;
for (unsigned j = 0; j < localMI.getNumOperands(); j++) {
MachineOperand &localMO = localMI.getOperand(j);
diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp
index ee3209354688d..7d961a238ae28 100644
--- a/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -100,9 +100,6 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
MachineFunctionPass::getAnalysisUsage(AU);
}
-
- private:
- void ChangeOpInto(MachineOperand &Dst, MachineOperand &Src);
};
}
@@ -132,7 +129,9 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
PeepholeDoubleRegsMap.clear();
// Traverse the basic block.
- for (MachineInstr &MI : *MBB) {
+ for (auto I = MBB->begin(), E = MBB->end(), NextI = I; I != E; I = NextI) {
+ NextI = std::next(I);
+ MachineInstr &MI = *I;
// Look for sign extends:
// %vreg170<def> = SXTW %vreg166
if (!DisableOptSZExt && MI.getOpcode() == Hexagon::A2_sxtw) {
@@ -280,14 +279,13 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
if (NewOp) {
unsigned PSrc = MI.getOperand(PR).getReg();
if (unsigned POrig = PeepholeMap.lookup(PSrc)) {
- MI.getOperand(PR).setReg(POrig);
+ BuildMI(*MBB, MI.getIterator(), MI.getDebugLoc(),
+ QII->get(NewOp), MI.getOperand(0).getReg())
+ .addReg(POrig)
+ .add(MI.getOperand(S2))
+ .add(MI.getOperand(S1));
MRI->clearKillFlags(POrig);
- MI.setDesc(QII->get(NewOp));
- // Swap operands S1 and S2.
- MachineOperand Op1 = MI.getOperand(S1);
- MachineOperand Op2 = MI.getOperand(S2);
- ChangeOpInto(MI.getOperand(S1), Op2);
- ChangeOpInto(MI.getOperand(S2), Op1);
+ MI.eraseFromParent();
}
} // if (NewOp)
} // if (!Done)
@@ -299,40 +297,6 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
return true;
}
-void HexagonPeephole::ChangeOpInto(MachineOperand &Dst, MachineOperand &Src) {
- assert (&Dst != &Src && "Cannot duplicate into itself");
- switch (Dst.getType()) {
- case MachineOperand::MO_Register:
- if (Src.isReg()) {
- Dst.setReg(Src.getReg());
- Dst.setSubReg(Src.getSubReg());
- MRI->clearKillFlags(Src.getReg());
- } else if (Src.isImm()) {
- Dst.ChangeToImmediate(Src.getImm());
- } else {
- llvm_unreachable("Unexpected src operand type");
- }
- break;
-
- case MachineOperand::MO_Immediate:
- if (Src.isImm()) {
- Dst.setImm(Src.getImm());
- } else if (Src.isReg()) {
- Dst.ChangeToRegister(Src.getReg(), Src.isDef(), Src.isImplicit(),
- false, Src.isDead(), Src.isUndef(),
- Src.isDebug());
- Dst.setSubReg(Src.getSubReg());
- } else {
- llvm_unreachable("Unexpected src operand type");
- }
- break;
-
- default:
- llvm_unreachable("Unexpected dst operand type");
- break;
- }
-}
-
FunctionPass *llvm::createHexagonPeephole() {
return new HexagonPeephole();
}
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index 8851a23ae8ace..0aada8a53c979 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -1,4 +1,4 @@
-//===-- HexagonSubtarget.cpp - Hexagon Subtarget Information --------------===//
+//===- HexagonSubtarget.cpp - Hexagon Subtarget Information ---------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,13 +11,23 @@
//
//===----------------------------------------------------------------------===//
-#include "HexagonSubtarget.h"
#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include <algorithm>
+#include <cassert>
#include <map>
using namespace llvm;
@@ -119,9 +129,7 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU,
StringRef FS, const TargetMachine &TM)
: HexagonGenSubtargetInfo(TT, CPU, FS), CPUString(CPU),
- InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
- FrameLowering() {
-
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) {
initializeEnvironment();
// Initialize scheduling itinerary for the specified CPU.
@@ -196,7 +204,6 @@ void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
updateLatency(*SrcInst, *DstInst, Dep);
}
-
void HexagonSubtarget::HexagonDAGMutation::apply(ScheduleDAGInstrs *DAG) {
for (auto &SU : DAG->SUnits) {
if (!SU.isInstr())
@@ -240,18 +247,18 @@ void HexagonSubtarget::HexagonDAGMutation::apply(ScheduleDAGInstrs *DAG) {
}
}
-
void HexagonSubtarget::getPostRAMutations(
- std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
- Mutations.push_back(make_unique<HexagonSubtarget::HexagonDAGMutation>());
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
+ Mutations.push_back(
+ llvm::make_unique<HexagonSubtarget::HexagonDAGMutation>());
}
void HexagonSubtarget::getSMSMutations(
- std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
- Mutations.push_back(make_unique<HexagonSubtarget::HexagonDAGMutation>());
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
+ Mutations.push_back(
+ llvm::make_unique<HexagonSubtarget::HexagonDAGMutation>());
}
-
// Pin the vtable to this file.
void HexagonSubtarget::anchor() {}
@@ -447,4 +454,3 @@ unsigned HexagonSubtarget::getL1PrefetchDistance() const {
bool HexagonSubtarget::enableSubRegLiveness() const {
return EnableSubregLiveness;
}
-
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index 4379efa79c9cd..753dca0000652 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -1,4 +1,4 @@
-//===-- HexagonSubtarget.h - Define Subtarget for the Hexagon ---*- C++ -*-===//
+//===- HexagonSubtarget.h - Define Subtarget for the Hexagon ----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,12 +15,17 @@
#define LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H
#include "HexagonFrameLowering.h"
-#include "HexagonISelLowering.h"
#include "HexagonInstrInfo.h"
+#include "HexagonISelLowering.h"
#include "HexagonSelectionDAGInfo.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <memory>
#include <string>
+#include <vector>
#define GET_SUBTARGETINFO_HEADER
#include "HexagonGenSubtargetInfo.inc"
@@ -30,6 +35,12 @@
namespace llvm {
+class MachineInstr;
+class SDep;
+class SUnit;
+class TargetMachine;
+class Triple;
+
class HexagonSubtarget : public HexagonGenSubtargetInfo {
virtual void anchor();
@@ -57,6 +68,7 @@ private:
HexagonSelectionDAGInfo TSInfo;
HexagonFrameLowering FrameLowering;
InstrItineraryData InstrItins;
+
void initializeEnvironment();
public:
@@ -108,6 +120,7 @@ public:
bool useBSBScheduling() const { return UseBSBScheduling; }
bool enableMachineScheduler() const override;
+
// Always use the TargetLowering default scheduler.
// FIXME: This will use the vliw scheduler which is probably just hurting
// compiler time and will be removed eventually anyway.
@@ -124,6 +137,7 @@ public:
unsigned getSmallDataThreshold() const {
return Hexagon_SMALL_DATA_THRESHOLD;
}
+
const HexagonArchEnum &getHexagonArchVersion() const {
return HexagonArchVersion;
}
@@ -155,4 +169,4 @@ private:
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index e507a797871fc..031a1bdefafbf 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -291,7 +291,6 @@ bool HexagonPassConfig::addInstSelector() {
if (EnableBitSimplify)
addPass(createHexagonBitSimplify());
addPass(createHexagonPeephole());
- printAndVerify("After hexagon peephole pass");
// Constant propagation.
if (!DisableHCP) {
addPass(createHexagonConstPropagationPass());
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index 545c8b6b2acde..093ce80bc2e3f 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -202,10 +202,8 @@ public:
/// processFixupValue - Target hook to adjust the literal value of a fixup
/// if necessary. IsResolved signals whether the caller believes a relocation
/// is needed; the target can modify the value. The default does nothing.
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override {
+ void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, bool &IsResolved) override {
MCFixupKind Kind = Fixup.getKind();
switch((unsigned)Kind) {
@@ -415,9 +413,9 @@ public:
/// ApplyFixup - Apply the \arg Value for given \arg Fixup into the provided
/// data fragment, at the offset specified by the fixup and following the
/// fixup kind as appropriate.
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t FixupValue, bool IsPCRel,
- MCContext &Ctx) const override {
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t FixupValue, bool IsPCRel) const override {
// When FixupValue is 0 the relocation is external and there
// is nothing for us to do.
@@ -432,8 +430,8 @@ public:
// to a real offset before we can use it.
uint32_t Offset = Fixup.getOffset();
unsigned NumBytes = getFixupKindNumBytes(Kind);
- assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
- char *InstAddr = Data + Offset;
+ assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
+ char *InstAddr = Data.data() + Offset;
Value = adjustFixupValue(Kind, FixupValue);
if(!Value)
@@ -517,7 +515,7 @@ public:
dbgs() << "\tBValue=0x"; dbgs().write_hex(Value) <<
": AValue=0x"; dbgs().write_hex(FixupValue) <<
": Offset=" << Offset <<
- ": Size=" << DataSize <<
+ ": Size=" << Data.size() <<
": OInst=0x"; dbgs().write_hex(OldData) <<
": Reloc=0x"; dbgs().write_hex(Reloc););
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
index e8f154a1fa533..c7114c7f18a0a 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -701,33 +701,32 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
break;
case Hexagon::A2_addi:
Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
- assert(Absolute);(void)Absolute;
- if (Value == 1) {
- Result.setOpcode(Hexagon::SA1_inc);
- addOps(Result, Inst, 0);
- addOps(Result, Inst, 1);
- break;
- } // 1,2 SUBInst $Rd = add($Rs, #1)
- else if (Value == -1) {
- Result.setOpcode(Hexagon::SA1_dec);
- addOps(Result, Inst, 0);
- addOps(Result, Inst, 1);
- addOps(Result, Inst, 2);
- break;
- } // 1,2 SUBInst $Rd = add($Rs,#-1)
- else if (Inst.getOperand(1).getReg() == Hexagon::R29) {
- Result.setOpcode(Hexagon::SA1_addsp);
- addOps(Result, Inst, 0);
- addOps(Result, Inst, 2);
- break;
- } // 1,3 SUBInst $Rd = add(r29, #$u6_2)
- else {
- Result.setOpcode(Hexagon::SA1_addi);
- addOps(Result, Inst, 0);
- addOps(Result, Inst, 1);
- addOps(Result, Inst, 2);
- break;
- } // 1,2,3 SUBInst $Rx = add($Rx, #$s7)
+ if (Absolute) {
+ if (Value == 1) {
+ Result.setOpcode(Hexagon::SA1_inc);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break;
+ } // 1,2 SUBInst $Rd = add($Rs, #1)
+ if (Value == -1) {
+ Result.setOpcode(Hexagon::SA1_dec);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ addOps(Result, Inst, 2);
+ break;
+ } // 1,2 SUBInst $Rd = add($Rs,#-1)
+ if (Inst.getOperand(1).getReg() == Hexagon::R29) {
+ Result.setOpcode(Hexagon::SA1_addsp);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 2);
+ break;
+ } // 1,3 SUBInst $Rd = add(r29, #$u6_2)
+ }
+ Result.setOpcode(Hexagon::SA1_addi);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ addOps(Result, Inst, 2);
+ break; // 1,2,3 SUBInst $Rx = add($Rx, #$s7)
case Hexagon::A2_add:
Result.setOpcode(Hexagon::SA1_addrx);
addOps(Result, Inst, 0);
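Restated as a hedged control-flow sketch (hypothetical names, not the HexagonMCInstrInfo API): hoisting the Absolute guard lets the generic SA1_addi form double as the fallback for immediates that never evaluate to a constant, which the old assert-then-branch shape could not express.

    #include <cstdint>

    enum class SubInst { Inc, Dec, AddSp, AddI };

    SubInst pickAddSubInst(bool Absolute, int64_t Value, bool BaseIsSP) {
      if (Absolute) {
        if (Value == 1)  return SubInst::Inc;   // $Rd = add($Rs, #1)
        if (Value == -1) return SubInst::Dec;   // $Rd = add($Rs, #-1)
        if (BaseIsSP)    return SubInst::AddSp; // $Rd = add(r29, #$u6_2)
      }
      return SubInst::AddI;                     // $Rx = add($Rx, #$s7)
    }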
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
index 0ef1401ef531a..c212726113ab7 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
@@ -49,8 +49,9 @@ public:
LanaiAsmBackend(const Target &T, Triple::OSType OST)
: MCAsmBackend(), OSType(OST) {}
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const override;
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
@@ -88,9 +89,10 @@ bool LanaiAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
return true;
}
-void LanaiAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned /*DataSize*/, uint64_t Value,
- bool /*IsPCRel*/, MCContext & /*Ctx*/) const {
+void LanaiAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool /*IsPCRel*/) const {
MCFixupKind Kind = Fixup.getKind();
Value = adjustFixupValue(static_cast<unsigned>(Kind), Value);
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index d8fdc8ba674e6..982c6fea62d44 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -32,16 +32,20 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
return *RM;
}
+static std::string computeDataLayout(const Triple &TT, StringRef CPU,
+ const TargetOptions &Options) {
+ return "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16";
+}
+
MSP430TargetMachine::MSP430TargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
Optional<Reloc::Model> RM,
CodeModel::Model CM,
CodeGenOpt::Level OL)
- : LLVMTargetMachine(T, "e-m:e-p:16:16-i32:16:32-a:16-n8:16", TT, CPU, FS,
+ : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options), TT, CPU, FS,
Options, getEffectiveRelocModel(RM), CM, OL),
TLOF(make_unique<TargetLoweringObjectFileELF>()),
- // FIXME: Check DataLayout string.
Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
}
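The new layout string tightens alignments rather than just relocating the literal: the old "i32:16:32" kept a 32-bit preferred alignment for i32, while "i32:16" (plus the added i64/f32/f64 entries) pins everything at 16 bits. A hedged sketch querying the difference through the public DataLayout API (throwaway LLVMContext assumed):

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Type.h"
    using namespace llvm;

    unsigned msp430Int32PrefAlignInBytes() {
      LLVMContext Ctx;
      DataLayout DL("e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16");
      // 2 bytes with the new string; the old "i32:16:32" reported 4.
      return DL.getPrefTypeAlignment(Type::getInt32Ty(Ctx));
    }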
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 694c201cbe8dc..9d5c179a0fd90 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -322,6 +322,7 @@ class MipsAsmParser : public MCTargetAsmParser {
bool parseDirectiveSet();
bool parseDirectiveOption();
bool parseInsnDirective();
+ bool parseRSectionDirective(StringRef Section);
bool parseSSectionDirective(StringRef Section, unsigned Type);
bool parseSetAtDirective();
@@ -5106,7 +5107,7 @@ int MipsAsmParser::matchCPURegisterName(StringRef Name) {
CC = StringSwitch<unsigned>(Name)
.Case("zero", 0)
- .Case("at", 1)
+ .Cases("at", "AT", 1)
.Case("a0", 4)
.Case("a1", 5)
.Case("a2", 6)
@@ -6952,6 +6953,23 @@ bool MipsAsmParser::parseInsnDirective() {
return false;
}
+/// parseRSectionDirective
+/// ::= .rdata
+bool MipsAsmParser::parseRSectionDirective(StringRef Section) {
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ MCSection *ELFSection = getContext().getELFSection(
+ Section, ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+ getParser().getStreamer().SwitchSection(ELFSection);
+
+ getParser().Lex(); // Eat EndOfStatement token.
+ return false;
+}
+
/// parseSSectionDirective
/// ::= .sbss
/// ::= .sdata
@@ -7499,6 +7517,10 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
parseInsnDirective();
return false;
}
+ if (IDVal == ".rdata") {
+ parseRSectionDirective(".rodata");
+ return false;
+ }
if (IDVal == ".sbss") {
parseSSectionDirective(IDVal, ELF::SHT_NOBITS);
return false;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 6d3d4db036032..ae48d6e38fa0f 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -235,10 +235,12 @@ static unsigned calculateMMLEIndex(unsigned i) {
/// ApplyFixup - Apply the \p Value for given \p Fixup into the provided
/// data fragment, at the offset specified by the fixup and following the
/// fixup kind as appropriate.
-void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value, bool IsPCRel,
- MCContext &Ctx) const {
+void MipsAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool IsPCRel) const {
MCFixupKind Kind = Fixup.getKind();
+ MCContext &Ctx = Asm.getContext();
Value = adjustFixupValue(Fixup, Value, Ctx);
if (!Value)
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index 4b3cc6e21f4cd..bf3b290b7ed53 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -38,8 +38,9 @@ public:
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const override;
Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td
index 3272319ad50f4..7daea163b8a64 100644
--- a/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -326,9 +326,9 @@ class AUIPC_DESC : ALUIPC_DESC_BASE<"auipc", GPR32Opnd, II_AUIPC>;
class AUI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
InstrItinClass itin = NoItinerary>
: MipsR6Arch<instr_asm> {
- dag OutOperandList = (outs GPROpnd:$rs);
- dag InOperandList = (ins GPROpnd:$rt, uimm16:$imm);
- string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $imm");
+ dag OutOperandList = (outs GPROpnd:$rt);
+ dag InOperandList = (ins GPROpnd:$rs, uimm16:$imm);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $imm");
list<dag> Pattern = [];
InstrItinClass Itinerary = itin;
}
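For reference, the architected r6 semantics behind this operand swap put rt as the destination; a hedged annotation (not TableGen output):

    // aui $rt, $rs, imm   =>   GPR[rt] = GPR[rs] + sign_extend(imm << 16)
    // The old definition printed and encoded "$rs, $rt, $imm", treating the
    // source register as the destination; the fix restores the architected
    // operand order.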
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 68708dc4f50fe..02102d6b22f4e 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -907,6 +907,11 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
if (!(CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1))))
return SDValue();
}
+ // Don't generate INS if the constant OR operand doesn't fit into the bits
+ // cleared by the constant AND operand.
+ if (CN->getSExtValue() & CN1->getSExtValue())
+ return SDValue();
+
SDLoc DL(N);
EVT ValTy = N->getOperand(0)->getValueType(0);
SDValue Const1;
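A hedged worked example of the new bail-out (hypothetical helper, not the LLVM code): INS can only deposit into the field that the AND mask clears, so an OR constant overlapping the surviving bits disqualifies the combine.

    #include <cstdint>

    // (x & AndMask) | OrImm is only an INS candidate when OrImm lies
    // entirely inside the bits the AND clears, i.e. outside AndMask.
    bool orImmFitsClearedField(int64_t AndMask, int64_t OrImm) {
      return (AndMask & OrImm) == 0;
    }
    // orImmFitsClearedField(0xFFFF0000, 0x1234)     -> true  (combine allowed)
    // orImmFitsClearedField(0xFFFF0000, 0x12340000) -> false (new guard bails)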
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index 272595af5f6f1..b95f1158fa562 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -274,8 +274,8 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
if (IsPIC) {
MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(FallThroughMBB, BalTgtMBB);
- LongBrMBB->addSuccessor(BalTgtMBB, BranchProbability::getOne());
- BalTgtMBB->addSuccessor(&*FallThroughMBB, BranchProbability::getOne());
+ LongBrMBB->addSuccessor(BalTgtMBB);
+ BalTgtMBB->addSuccessor(TgtMBB);
// We must select between the MIPS32r6/MIPS64r6 BAL (which is a normal
// instruction) and the pre-MIPS32r6/MIPS64r6 definition (which is an
@@ -342,8 +342,8 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
.addReg(Mips::SP).addImm(8);
if (Subtarget.hasMips32r6())
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR), Mips::ZERO)
- .addReg(Mips::AT);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR))
+ .addReg(Mips::ZERO).addReg(Mips::AT);
else
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR)).addReg(Mips::AT);
@@ -415,8 +415,8 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
.addReg(Mips::SP_64).addImm(0);
if (Subtarget.hasMips64r6())
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR64), Mips::ZERO_64)
- .addReg(Mips::AT_64);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR64))
+ .addReg(Mips::ZERO_64).addReg(Mips::AT_64);
else
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR64)).addReg(Mips::AT_64);
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index 2382ea2716612..b57bceb3c8371 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -1257,19 +1257,22 @@ static SDValue lowerMSACopyIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) {
EVT ResVecTy = Op->getValueType(0);
EVT ViaVecTy = ResVecTy;
+ bool BigEndian = !DAG.getSubtarget().getTargetTriple().isLittleEndian();
SDLoc DL(Op);
// When ResVecTy == MVT::v2i64, LaneA is the upper 32 bits of the lane and
// LaneB is the lower 32-bits. Otherwise LaneA and LaneB are alternating
// lanes.
- SDValue LaneA;
- SDValue LaneB = Op->getOperand(2);
+ SDValue LaneA = Op->getOperand(OpNr);
+ SDValue LaneB;
if (ResVecTy == MVT::v2i64) {
- LaneA = DAG.getConstant(0, DL, MVT::i32);
+ LaneB = DAG.getConstant(0, DL, MVT::i32);
ViaVecTy = MVT::v4i32;
+ if (BigEndian)
+ std::swap(LaneA, LaneB);
} else
- LaneA = LaneB;
+ LaneB = LaneA;
SDValue Ops[16] = { LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB,
LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB };
@@ -1277,8 +1280,11 @@ static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) {
SDValue Result = DAG.getBuildVector(
ViaVecTy, DL, makeArrayRef(Ops, ViaVecTy.getVectorNumElements()));
- if (ViaVecTy != ResVecTy)
- Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy, Result);
+ if (ViaVecTy != ResVecTy) {
+ SDValue One = DAG.getConstant(1, DL, ViaVecTy);
+ Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy,
+ DAG.getNode(ISD::AND, DL, ViaVecTy, Result, One));
+ }
return Result;
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 028c2cb562f8e..6d7eb786a6835 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -113,8 +113,9 @@ public:
return (IsLittleEndian? InfosLE : InfosBE)[Kind - FirstTargetFixupKind];
}
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel, MCContext &Ctx) const override {
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const override {
Value = adjustFixupValue(Fixup.getKind(), Value);
if (!Value) return; // Doesn't change encoding.
@@ -130,10 +131,8 @@ public:
}
}
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override {
+ void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, bool &IsResolved) override {
switch ((PPC::Fixups)Fixup.getKind()) {
default: break;
case PPC::fixup_ppc_br24:
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index 38ae62b26757a..07c9c1f9f84c0 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h
@@ -24,7 +24,6 @@ namespace llvm {
class PPCTargetMachine;
class PassRegistry;
class FunctionPass;
- class ImmutablePass;
class MachineInstr;
class AsmPrinter;
class MCInst;
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 57a1d373c88cf..c2c115cb6dafa 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -521,7 +521,7 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const {
const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
bool HasBP = RegInfo->hasBasePointer(MF);
unsigned BPReg = HasBP ? (unsigned) RegInfo->getBaseRegister(MF) : FPReg;
- unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FPReg;
+ unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FP8Reg;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI)
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 662550f7a396a..72f14e9691382 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -2560,8 +2560,9 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
PPCII::MO_TPREL_HA);
SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
PPCII::MO_TPREL_LO);
- SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2,
- is64bit ? MVT::i64 : MVT::i32);
+ SDValue TLSReg = is64bit ? DAG.getRegister(PPC::X13, MVT::i64)
+ : DAG.getRegister(PPC::R2, MVT::i32);
+
SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg);
return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi);
}
@@ -8377,9 +8378,9 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (IntrinsicID == Intrinsic::thread_pointer) {
// Reads the thread pointer register, used for __builtin_thread_pointer.
- bool is64bit = Subtarget.isPPC64();
- return DAG.getRegister(is64bit ? PPC::X13 : PPC::R2,
- is64bit ? MVT::i64 : MVT::i32);
+ if (Subtarget.isPPC64())
+ return DAG.getRegister(PPC::X13, MVT::i64);
+ return DAG.getRegister(PPC::R2, MVT::i32);
}
// If this is a lowered altivec predicate compare, CompareOpc is set to the
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 70536a6039b82..e2af5e5295445 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -972,13 +972,15 @@ def LDMX : XForm_1<31, 309, (outs g8rc:$rD), (ins memrr:$src),
// Support for medium and large code model.
let hasSideEffects = 0 in {
+let isReMaterializable = 1 in {
def ADDIStocHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
"#ADDIStocHA", []>, isPPC64;
+def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
+ "#ADDItocL", []>, isPPC64;
+}
let mayLoad = 1 in
def LDtocL: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg),
"#LDtocL", []>, isPPC64;
-def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
- "#ADDItocL", []>, isPPC64;
}
// Support for thread-local storage.
@@ -994,7 +996,7 @@ def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg),
(PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>,
isPPC64;
-let isBarrier = 1, isPseudo = 1, Defs = [CR7], Itinerary = IIC_LdStSync in
+let isPseudo = 1, Defs = [CR7], Itinerary = IIC_LdStSync in
def CFENCE8 : Pseudo<(outs), (ins g8rc:$cr), "#CFENCE8", []>;
def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g),
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 236e513bec231..13b4f9ab962da 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -292,6 +292,29 @@ unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
return 0;
}
+// For opcodes with the ReMaterializable flag set, this function is called to
+// verify the instruction is really rematerializable.
+bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
+ AliasAnalysis *AA) const {
+ switch (MI.getOpcode()) {
+ default:
+ // This function should only be called for opcodes with the ReMaterializable
+ // flag set.
+ llvm_unreachable("Unknown rematerializable operation!");
+ break;
+ case PPC::LI:
+ case PPC::LI8:
+ case PPC::LIS:
+ case PPC::LIS8:
+ case PPC::QVGPCI:
+ case PPC::ADDIStocHA:
+ case PPC::ADDItocL:
+ case PPC::LOAD_STACK_GUARD:
+ return true;
+ }
+ return false;
+}
+
unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
// Note: This list must be kept consistent with StoreRegToStackSlot.
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index 8dd4dbb608794..b0629c88cf57b 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -162,6 +162,8 @@ public:
unsigned &SubIdx) const override;
unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
+ AliasAnalysis *AA) const override;
unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
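A hedged sketch of how this override is reached (simplified; the real callers are the coalescer and register allocator): the public query requires the static isReMaterializable flag from the .td file, then defers to the hook above for the final word on the listed opcodes.

    #include "llvm/Target/TargetInstrInfo.h"

    bool wouldRemat(const llvm::TargetInstrInfo &TII,
                    const llvm::MachineInstr &MI) {
      // Checks the MCInstrDesc flag, then calls the target's
      // isReallyTriviallyReMaterializable() override.
      return TII.isTriviallyReMaterializable(MI);
    }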
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 637e52bbdbeec..8af7f7e981171 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -389,9 +389,14 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) {
- BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg)
- .addReg(PPC::R31)
- .addImm(FrameSize);
+ if (LP64)
+ BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), Reg)
+ .addReg(PPC::X31)
+ .addImm(FrameSize);
+ else
+ BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg)
+ .addReg(PPC::R31)
+ .addImm(FrameSize);
} else if (LP64) {
BuildMI(MBB, II, dl, TII.get(PPC::LD), Reg)
.addImm(0)
@@ -478,8 +483,10 @@ void PPCRegisterInfo::lowerDynamicAreaOffset(
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
unsigned maxCallFrameSize = MFI.getMaxCallFrameSize();
+ bool is64Bit = TM.isPPC64();
DebugLoc dl = MI.getDebugLoc();
- BuildMI(MBB, II, dl, TII.get(PPC::LI), MI.getOperand(0).getReg())
+ BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LI8 : PPC::LI),
+ MI.getOperand(0).getReg())
.addImm(maxCallFrameSize);
MBB.erase(II);
}
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 5a226b23ff96f..a88a6541e8d00 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -86,9 +86,9 @@ EnableMachineCombinerPass("ppc-machine-combiner",
extern "C" void LLVMInitializePowerPCTarget() {
// Register the targets
- RegisterTargetMachine<PPC32TargetMachine> A(getThePPC32Target());
- RegisterTargetMachine<PPC64TargetMachine> B(getThePPC64Target());
- RegisterTargetMachine<PPC64TargetMachine> C(getThePPC64LETarget());
+ RegisterTargetMachine<PPCTargetMachine> A(getThePPC32Target());
+ RegisterTargetMachine<PPCTargetMachine> B(getThePPC64Target());
+ RegisterTargetMachine<PPCTargetMachine> C(getThePPC64LETarget());
PassRegistry &PR = *PassRegistry::getPassRegistry();
initializePPCBoolRetToIntPass(PR);
@@ -177,32 +177,34 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT,
assert(Options.MCOptions.getABIName().empty() &&
"Unknown target-abi option!");
- if (!TT.isMacOSX()) {
- switch (TT.getArch()) {
- case Triple::ppc64le:
- return PPCTargetMachine::PPC_ABI_ELFv2;
- case Triple::ppc64:
- return PPCTargetMachine::PPC_ABI_ELFv1;
- default:
- // Fallthrough.
- ;
- }
+ if (TT.isMacOSX())
+ return PPCTargetMachine::PPC_ABI_UNKNOWN;
+
+ switch (TT.getArch()) {
+ case Triple::ppc64le:
+ return PPCTargetMachine::PPC_ABI_ELFv2;
+ case Triple::ppc64:
+ return PPCTargetMachine::PPC_ABI_ELFv1;
+ default:
+ return PPCTargetMachine::PPC_ABI_UNKNOWN;
}
- return PPCTargetMachine::PPC_ABI_UNKNOWN;
}
static Reloc::Model getEffectiveRelocModel(const Triple &TT,
Optional<Reloc::Model> RM) {
- if (!RM.hasValue()) {
- if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le) {
- if (!TT.isOSBinFormatMachO() && !TT.isMacOSX())
- return Reloc::PIC_;
- }
- if (TT.isOSDarwin())
- return Reloc::DynamicNoPIC;
- return Reloc::Static;
- }
- return *RM;
+ if (RM.hasValue())
+ return *RM;
+
+ // Darwin defaults to dynamic-no-pic.
+ if (TT.isOSDarwin())
+ return Reloc::DynamicNoPIC;
+
+ // Non-Darwin 64-bit platforms are PIC by default.
+ if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le)
+ return Reloc::PIC_;
+
+ // 32-bit is static by default.
+ return Reloc::Static;
}
// The FeatureString here is a little subtle. We are modifying the feature
@@ -224,26 +226,6 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT,
PPCTargetMachine::~PPCTargetMachine() = default;
-void PPC32TargetMachine::anchor() {}
-
-PPC32TargetMachine::PPC32TargetMachine(const Target &T, const Triple &TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Optional<Reloc::Model> RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
-
-void PPC64TargetMachine::anchor() {}
-
-PPC64TargetMachine::PPC64TargetMachine(const Target &T, const Triple &TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Optional<Reloc::Model> RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
-
const PPCSubtarget *
PPCTargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
@@ -406,7 +388,7 @@ void PPCPassConfig::addPreRegAlloc() {
// FIXME: We probably don't need to run these for -fPIE.
if (getPPCTargetMachine().isPositionIndependent()) {
// FIXME: LiveVariables should not be necessary here!
- // PPCTLSDYnamicCallPass uses LiveIntervals which previously dependet on
+ // PPCTLSDynamicCallPass uses LiveIntervals which previously depended on
// LiveVariables. This (unnecessary) dependency has been removed now,
// however a stage-2 clang build fails without LiveVariables computed here.
addPass(&LiveVariablesID, false);
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index b8f5a2083d808..5eb6ba785d1b8 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -23,7 +23,7 @@ namespace llvm {
/// Common code between 32-bit and 64-bit PowerPC targets.
///
-class PPCTargetMachine : public LLVMTargetMachine {
+class PPCTargetMachine final : public LLVMTargetMachine {
public:
enum PPCABI { PPC_ABI_UNKNOWN, PPC_ABI_ELFv1, PPC_ABI_ELFv2 };
private:
@@ -60,29 +60,6 @@ public:
return false;
}
};
-
-/// PowerPC 32-bit target machine.
-///
-class PPC32TargetMachine : public PPCTargetMachine {
- virtual void anchor();
-public:
- PPC32TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options,
- Optional<Reloc::Model> RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
-};
-
-/// PowerPC 64-bit target machine.
-///
-class PPC64TargetMachine : public PPCTargetMachine {
- virtual void anchor();
-public:
- PPC64TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options,
- Optional<Reloc::Model> RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
-};
-
} // end namespace llvm
#endif
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index d9a71893afee7..f85c0cf111c43 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -32,8 +32,9 @@ public:
: MCAsmBackend(), OSABI(OSABI), Is64Bit(Is64Bit) {}
~RISCVAsmBackend() override {}
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const override;
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
@@ -69,9 +70,10 @@ bool RISCVAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
return true;
}
-void RISCVAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel, MCContext &Ctx) const {
+void RISCVAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool IsPCRel) const {
return;
}
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index d1d1334163a26..c72b47b090857 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -203,10 +203,8 @@ namespace {
return InfosBE[Kind - FirstTargetFixupKind];
}
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override {
+ void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, bool &IsResolved) override {
switch ((Sparc::Fixups)Fixup.getKind()) {
default: break;
case Sparc::fixup_sparc_wplt30:
@@ -273,9 +271,9 @@ namespace {
ELFSparcAsmBackend(const Target &T, Triple::OSType OSType) :
SparcAsmBackend(T), OSType(OSType) { }
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel,
- MCContext &Ctx) const override {
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const override {
Value = adjustFixupValue(Fixup.getKind(), Value);
if (!Value) return; // Doesn't change encoding.
diff --git a/lib/Target/Sparc/SparcTargetObjectFile.cpp b/lib/Target/Sparc/SparcTargetObjectFile.cpp
index 627e49a95f3cc..2c040dce994b6 100644
--- a/lib/Target/Sparc/SparcTargetObjectFile.cpp
+++ b/lib/Target/Sparc/SparcTargetObjectFile.cpp
@@ -15,6 +15,12 @@
using namespace llvm;
+void SparcELFTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
+
const MCExpr *SparcELFTargetObjectFile::getTTypeGlobalReference(
const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
MachineModuleInfo *MMI, MCStreamer &Streamer) const {
diff --git a/lib/Target/Sparc/SparcTargetObjectFile.h b/lib/Target/Sparc/SparcTargetObjectFile.h
index fe8800625a567..3b1b345c3b193 100644
--- a/lib/Target/Sparc/SparcTargetObjectFile.h
+++ b/lib/Target/Sparc/SparcTargetObjectFile.h
@@ -23,6 +23,8 @@ public:
TargetLoweringObjectFileELF()
{}
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
const MCExpr *getTTypeGlobalReference(const GlobalValue *GV,
unsigned Encoding,
const TargetMachine &TM,
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index fd1fd7bc40dcc..6b32a7926437a 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -50,8 +50,9 @@ public:
return SystemZ::NumTargetFixupKinds;
}
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const override;
bool mayNeedRelaxation(const MCInst &Inst) const override {
return false;
}
@@ -89,15 +90,17 @@ SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
return Infos[Kind - FirstTargetFixupKind];
}
-void SystemZMCAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel, MCContext &Ctx) const {
+void SystemZMCAsmBackend::applyFixup(const MCAssembler &Asm,
+ const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool IsPCRel) const {
MCFixupKind Kind = Fixup.getKind();
unsigned Offset = Fixup.getOffset();
unsigned BitSize = getFixupKindInfo(Kind).TargetSize;
unsigned Size = (BitSize + 7) / 8;
- assert(Offset + Size <= DataSize && "Invalid fixup offset!");
+ assert(Offset + Size <= Data.size() && "Invalid fixup offset!");
// Big-endian insertion of Size bytes.
Value = extractBitsForFixup(Kind, Value);
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp
index a28a91e834f61..0cb2b5a14ce73 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -277,8 +277,21 @@ void SystemZFrameLowering::
processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS) const {
MachineFrameInfo &MFFrame = MF.getFrameInfo();
- uint64_t MaxReach = (MFFrame.estimateStackSize(MF) +
- SystemZMC::CallFrameSize * 2);
+ // Get the size of our stack frame to be allocated ...
+ uint64_t StackSize = (MFFrame.estimateStackSize(MF) +
+ SystemZMC::CallFrameSize);
+ // ... and the maximum offset we may need to reach into the
+ // caller's frame to access the save area or stack arguments.
+ int64_t MaxArgOffset = SystemZMC::CallFrameSize;
+ for (int I = MFFrame.getObjectIndexBegin(); I != 0; ++I)
+ if (MFFrame.getObjectOffset(I) >= 0) {
+ int64_t ArgOffset = SystemZMC::CallFrameSize +
+ MFFrame.getObjectOffset(I) +
+ MFFrame.getObjectSize(I);
+ MaxArgOffset = std::max(MaxArgOffset, ArgOffset);
+ }
+
+ uint64_t MaxReach = StackSize + MaxArgOffset;
if (!isUInt<12>(MaxReach)) {
// We may need register scavenging slots if some parts of the frame
// are outside the reach of an unsigned 12-bit displacement.
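A hedged worked example of the new reach computation (all numbers invented for illustration):

    // estimateStackSize(MF) = 3800, CallFrameSize = 160, one incoming stack
    // argument at caller-frame offset 80 with size 8:
    //   StackSize    = 3800 + 160             = 3960
    //   MaxArgOffset = max(160, 160 + 80 + 8) = 248
    //   MaxReach     = 3960 + 248             = 4208
    // 4208 exceeds the 4095 limit of an unsigned 12-bit displacement, so the
    // scavenging slots are reserved. The old "StackSize + 2 * CallFrameSize"
    // heuristic ignored argument offsets entirely and could underestimate
    // functions with deep incoming argument areas.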
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index ac4c3f6db684d..fef4a8c92a362 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1322,11 +1322,6 @@ SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, RetOps);
}
-SDValue SystemZTargetLowering::prepareVolatileOrAtomicLoad(
- SDValue Chain, const SDLoc &DL, SelectionDAG &DAG) const {
- return DAG.getNode(SystemZISD::SERIALIZE, DL, MVT::Other, Chain);
-}
-
// Return true if Op is an intrinsic node with chain that returns the CC value
// as its only (other) argument. Provide the associated SystemZISD opcode and
// the mask of valid CC values if so.
@@ -2059,6 +2054,7 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL,
if (NewC.ICmpType != SystemZICMP::SignedOnly &&
NewC.Op0.getOpcode() == ISD::SHL &&
isSimpleShift(NewC.Op0, ShiftVal) &&
+ (MaskVal >> ShiftVal != 0) &&
(NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
MaskVal >> ShiftVal,
CmpVal >> ShiftVal,
@@ -2068,6 +2064,7 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL,
} else if (NewC.ICmpType != SystemZICMP::SignedOnly &&
NewC.Op0.getOpcode() == ISD::SRL &&
isSimpleShift(NewC.Op0, ShiftVal) &&
+ (MaskVal << ShiftVal != 0) &&
(NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
MaskVal << ShiftVal,
CmpVal << ShiftVal,
@@ -3212,12 +3209,15 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
return DAG.getNode(SystemZISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
}
-// Op is an atomic load. Lower it into a normal volatile load.
+// Op is an atomic load. Lower it into a serialization followed
+// by a normal volatile load.
SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
SelectionDAG &DAG) const {
auto *Node = cast<AtomicSDNode>(Op.getNode());
+ SDValue Chain = SDValue(DAG.getMachineNode(SystemZ::Serialize, SDLoc(Op),
+ MVT::Other, Node->getChain()), 0);
return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), Op.getValueType(),
- Node->getChain(), Node->getBasePtr(),
+ Chain, Node->getBasePtr(),
Node->getMemoryVT(), Node->getMemOperand());
}
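A hedged illustration of the sequence this lowering now produces for an atomic load (instruction selection simplified):

    // %ch  = SystemZ::Serialize(%in.chain)   ; expands to "bcr 14, 0"
    // %val = volatile ext-load ptr, %ch      ; e.g. "l %r2, 0(%r3)"
    // The explicit machine node replaces the removed
    // prepareVolatileOrAtomicLoad() hook and the z_serialize SD node.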
@@ -4688,7 +4688,6 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(STRCMP);
OPCODE(SEARCH_STRING);
OPCODE(IPM);
- OPCODE(SERIALIZE);
OPCODE(MEMBARRIER);
OPCODE(TBEGIN);
OPCODE(TBEGIN_NOFLOAT);
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 79c8c4d92669f..5dcb19c0a35db 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -139,9 +139,6 @@ enum NodeType : unsigned {
// Store the CC value in bits 29 and 28 of an integer.
IPM,
- // Perform a serialization operation. (BCR 15,0 or BCR 14,0.)
- SERIALIZE,
-
// Compiler barrier only; generate a no-op.
MEMBARRIER,
@@ -471,8 +468,6 @@ public:
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
- SDValue prepareVolatileOrAtomicLoad(SDValue Chain, const SDLoc &DL,
- SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
ISD::NodeType getExtendForAtomicOps() const override {
@@ -522,7 +517,6 @@ private:
unsigned Opcode) const;
SDValue lowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerLOAD_SEQUENCE_POINT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index fa5ecdd852433..9f5e6288348e0 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -189,18 +189,15 @@ let isBranch = 1, isTerminator = 1 in {
//===----------------------------------------------------------------------===//
// Unconditional trap.
-// FIXME: This trap instruction should be marked as isTerminator, but there is
-// currently a general bug that allows non-terminators to be placed between
-// terminators. Temporarily leave this unmarked until the bug is fixed.
-let isBarrier = 1, hasCtrlDep = 1 in
+let hasCtrlDep = 1 in
def Trap : Alias<4, (outs), (ins), [(trap)]>;
// Conditional trap.
-let isTerminator = 1, hasCtrlDep = 1, Uses = [CC] in
+let hasCtrlDep = 1, Uses = [CC] in
def CondTrap : Alias<4, (outs), (ins cond4:$valid, cond4:$R1), []>;
// Fused compare-and-trap instructions.
-let isTerminator = 1, hasCtrlDep = 1 in {
+let hasCtrlDep = 1 in {
// These patterns work the same way as for compare-and-branch.
defm CRT : CmpBranchRRFcPair<"crt", 0xB972, GR32>;
defm CGRT : CmpBranchRRFcPair<"cgrt", 0xB960, GR64>;
@@ -1449,7 +1446,7 @@ let Predicates = [FeatureExecutionHint] in {
// A serialization instruction that acts as a barrier for all memory
// accesses, which expands to "bcr 14, 0".
let hasSideEffects = 1 in
-def Serialize : Alias<2, (outs), (ins), [(z_serialize)]>;
+def Serialize : Alias<2, (outs), (ins), []>;
// A pseudo instruction that serves as a compiler barrier.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in
diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/lib/Target/SystemZ/SystemZMachineScheduler.cpp
index ab6020f3f1896..b6feaa49d8585 100644
--- a/lib/Target/SystemZ/SystemZMachineScheduler.cpp
+++ b/lib/Target/SystemZ/SystemZMachineScheduler.cpp
@@ -23,7 +23,7 @@ using namespace llvm;
#ifndef NDEBUG
// Print the set of SUs
void SystemZPostRASchedStrategy::SUSet::
-dump(SystemZHazardRecognizer &HazardRec) {
+dump(SystemZHazardRecognizer &HazardRec) const {
dbgs() << "{";
for (auto &SU : *this) {
HazardRec.dumpSU(SU, dbgs());
diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.h b/lib/Target/SystemZ/SystemZMachineScheduler.h
index 12357e0348a9e..3dfef388691e7 100644
--- a/lib/Target/SystemZ/SystemZMachineScheduler.h
+++ b/lib/Target/SystemZ/SystemZMachineScheduler.h
@@ -72,7 +72,7 @@ class SystemZPostRASchedStrategy : public MachineSchedStrategy {
// A set of SUs with a sorter and dump method.
struct SUSet : std::set<SUnit*, SUSorter> {
#ifndef NDEBUG
- void dump(SystemZHazardRecognizer &HazardRec);
+ void dump(SystemZHazardRecognizer &HazardRec) const;
#endif
};
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index adfc69c5d4cf4..ab2392809f3be 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -191,8 +191,6 @@ def z_sdivrem64 : SDNode<"SystemZISD::SDIVREM64", SDT_ZGR128Binary64>;
def z_udivrem32 : SDNode<"SystemZISD::UDIVREM32", SDT_ZGR128Binary32>;
def z_udivrem64 : SDNode<"SystemZISD::UDIVREM64", SDT_ZGR128Binary64>;
-def z_serialize : SDNode<"SystemZISD::SERIALIZE", SDTNone,
- [SDNPHasChain, SDNPMayStore]>;
def z_membarrier : SDNode<"SystemZISD::MEMBARRIER", SDTNone,
[SDNPHasChain, SDNPSideEffect]>;
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h
index eb2f17a2091c3..a10ca64fa6329 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -51,8 +51,6 @@ public:
}
bool targetSchedulesPostRAScheduling() const override { return true; };
-
- bool isMachineVerifierClean() const override { return false; }
};
} // end namespace llvm
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
index 4f20096c15830..1357cb5735f8a 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
@@ -37,8 +37,9 @@ public:
: MCAsmBackend(), Is64Bit(Is64Bit) {}
~WebAssemblyAsmBackendELF() override {}
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const override;
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
@@ -77,8 +78,9 @@ public:
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const override;
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
@@ -105,9 +107,11 @@ bool WebAssemblyAsmBackendELF::writeNopData(uint64_t Count,
return true;
}
-void WebAssemblyAsmBackendELF::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel, MCContext &Ctx) const {
+void WebAssemblyAsmBackendELF::applyFixup(const MCAssembler &Asm,
+ const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const {
const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind());
assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags");
@@ -119,7 +123,7 @@ void WebAssemblyAsmBackendELF::applyFixup(const MCFixup &Fixup, char *Data,
Value <<= Info.TargetOffset;
unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+ assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
@@ -163,9 +167,11 @@ bool WebAssemblyAsmBackend::writeNopData(uint64_t Count,
return true;
}
-void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel, MCContext &Ctx) const {
+void WebAssemblyAsmBackend::applyFixup(const MCAssembler &Asm,
+ const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const {
const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind());
assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags");
@@ -177,7 +183,7 @@ void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
Value <<= Info.TargetOffset;
unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+ assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
index c56c591def361..3e3b52fca5691 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
@@ -36,7 +36,6 @@ STATISTIC(MCNumFixups, "Number of MC fixups created.");
namespace {
class WebAssemblyMCCodeEmitter final : public MCCodeEmitter {
const MCInstrInfo &MCII;
- MCContext &Ctx;
// Implementation generated by tablegen.
uint64_t getBinaryCodeForInstr(const MCInst &MI,
@@ -48,14 +47,12 @@ class WebAssemblyMCCodeEmitter final : public MCCodeEmitter {
const MCSubtargetInfo &STI) const override;
public:
- WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
- : MCII(mcii), Ctx(ctx) {}
+ WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {}
};
} // end anonymous namespace
-MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII,
- MCContext &Ctx) {
- return new WebAssemblyMCCodeEmitter(MCII, Ctx);
+MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII) {
+ return new WebAssemblyMCCodeEmitter(MCII);
}
void WebAssemblyMCCodeEmitter::encodeInstruction(
@@ -89,11 +86,7 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
} else if (Info.OperandType == WebAssembly::OPERAND_I64IMM) {
encodeSLEB128(int64_t(MO.getImm()), OS);
} else if (Info.OperandType == WebAssembly::OPERAND_GLOBAL) {
- Fixups.push_back(MCFixup::create(
- OS.tell() - Start, MCConstantExpr::create(MO.getImm(), Ctx),
- MCFixupKind(WebAssembly::fixup_code_global_index), MI.getLoc()));
- ++MCNumFixups;
- encodeULEB128(uint64_t(MO.getImm()), OS);
+ llvm_unreachable("wasm globals should only be accessed symbolically");
} else if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) {
encodeSLEB128(int64_t(MO.getImm()), OS);
} else {
@@ -135,6 +128,9 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) {
FixupKind = MCFixupKind(WebAssembly::fixup_code_uleb128_i32);
PaddedSize = 5;
+ } else if (Info.OperandType == WebAssembly::OPERAND_GLOBAL) {
+ FixupKind = MCFixupKind(WebAssembly::fixup_code_global_index);
+ PaddedSize = 5;
} else {
llvm_unreachable("unexpected symbolic operand kind");
}
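The new OPERAND_GLOBAL branch joins the padded-fixup path. A hedged sketch of the 5-byte padded ULEB128 encoding that path emits, which is what lets applyFixup (and the wasm linker) patch the field in place:

    #include <cstdint>

    void encodePaddedULEB128(uint32_t V, uint8_t Out[5]) {
      for (int I = 0; I < 5; ++I) {
        uint8_t Byte = V & 0x7f;
        V >>= 7;
        if (I != 4)
          Byte |= 0x80; // continuation bit on all but the last byte
        Out[I] = Byte;
      }
    }
    // encodePaddedULEB128(3, Out) yields 83 80 80 80 00 -- the same value a
    // minimal encoding would express as the single byte 03.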
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index 9fd3ec81c258f..9580eeaa33d73 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -74,7 +74,7 @@ static MCInstPrinter *createMCInstPrinter(const Triple & /*T*/,
static MCCodeEmitter *createCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo & /*MRI*/,
MCContext &Ctx) {
- return createWebAssemblyMCCodeEmitter(MCII, Ctx);
+ return createWebAssemblyMCCodeEmitter(MCII);
}
static MCAsmBackend *createAsmBackend(const Target & /*T*/,
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 0ba700a86b744..4d676c32a09c5 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -35,8 +35,7 @@ class raw_pwrite_stream;
Target &getTheWebAssemblyTarget32();
Target &getTheWebAssemblyTarget64();
-MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII,
- MCContext &Ctx);
+MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII);
MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT);
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
index 19e14f3261aa7..9cf77829f3bc2 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
@@ -68,6 +68,8 @@ WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
bool IsFunction = IsFunctionExpr(Fixup.getValue());
switch (unsigned(Fixup.getKind())) {
+ case WebAssembly::fixup_code_global_index:
+ return wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB;
case WebAssembly::fixup_code_sleb128_i32:
if (IsFunction)
return wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB;
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index b999091e2d294..f51585a10ca12 100644
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -96,13 +96,6 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
MCConstantExpr::create(Size, OutContext));
}
}
-
- if (!TM.getTargetTriple().isOSBinFormatELF()) {
- MachineModuleInfoWasm &MMIW = MMI->getObjFileInfo<MachineModuleInfoWasm>();
- getTargetStreamer()->emitGlobal(MMIW.getGlobals());
- if (MMIW.hasStackPointerGlobal())
- getTargetStreamer()->emitStackPointer(MMIW.getStackPointerGlobal());
- }
}
void WebAssemblyAsmPrinter::EmitConstantPool() {
diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 09338a4898e03..c980f4b87f916 100644
--- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -63,12 +63,16 @@ class WebAssemblyFastISel final : public FastISel {
public:
// Innocuous defaults for our address.
Address() : Kind(RegBase), Offset(0), GV(0) { Base.Reg = 0; }
- void setKind(BaseKind K) { Kind = K; }
+ void setKind(BaseKind K) {
+ assert(!isSet() && "Can't change kind with non-zero base");
+ Kind = K;
+ }
BaseKind getKind() const { return Kind; }
bool isRegBase() const { return Kind == RegBase; }
bool isFIBase() const { return Kind == FrameIndexBase; }
void setReg(unsigned Reg) {
assert(isRegBase() && "Invalid base register access!");
+ assert(Base.Reg == 0 && "Overwriting non-zero register");
Base.Reg = Reg;
}
unsigned getReg() const {
@@ -77,6 +81,7 @@ class WebAssemblyFastISel final : public FastISel {
}
void setFI(unsigned FI) {
assert(isFIBase() && "Invalid base frame index access!");
+ assert(Base.FI == 0 && "Overwriting non-zero frame index");
Base.FI = FI;
}
unsigned getFI() const {
@@ -91,6 +96,13 @@ class WebAssemblyFastISel final : public FastISel {
int64_t getOffset() const { return Offset; }
void setGlobalValue(const GlobalValue *G) { GV = G; }
const GlobalValue *getGlobalValue() const { return GV; }
+ bool isSet() const {
+ if (isRegBase()) {
+ return Base.Reg != 0;
+ } else {
+ return Base.FI != 0;
+ }
+ }
};
/// Keep a pointer to the WebAssemblySubtarget around so that we can make the
@@ -297,6 +309,9 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {
DenseMap<const AllocaInst *, int>::iterator SI =
FuncInfo.StaticAllocaMap.find(AI);
if (SI != FuncInfo.StaticAllocaMap.end()) {
+ if (Addr.isSet()) {
+ return false;
+ }
Addr.setKind(Address::FrameIndexBase);
Addr.setFI(SI->second);
return true;
@@ -341,6 +356,9 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {
break;
}
}
+ if (Addr.isSet()) {
+ return false;
+ }
Addr.setReg(getRegForValue(Obj));
return Addr.getReg() != 0;
}
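A hedged distillation of the invariant the added asserts and isSet() encode (standalone type for illustration, not the FastISel class): an Address binds to a base exactly once, and computeAddress must fail rather than silently replace an earlier base.

    #include <cassert>

    struct AddrSketch {
      unsigned Reg = 0;
      bool isSet() const { return Reg != 0; }
      bool trySetReg(unsigned R) {
        if (isSet())
          return false; // caller bails out, as computeAddress now does
        Reg = R;
        return true;
      }
    };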
diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
index 4209bc333f230..a37d6136e44ed 100644
--- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
@@ -104,10 +104,10 @@ static void writeSPToMemory(unsigned SrcReg, MachineFunction &MF,
const DebugLoc &DL) {
const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ const char *ES = "__stack_pointer";
+ auto *SPSymbol = MF.createExternalSymbolName(ES);
if (MF.getSubtarget<WebAssemblySubtarget>()
.getTargetTriple().isOSBinFormatELF()) {
- const char *ES = "__stack_pointer";
- auto *SPSymbol = MF.createExternalSymbolName(ES);
MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetRegisterClass *PtrRC =
MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
@@ -125,10 +125,8 @@ static void writeSPToMemory(unsigned SrcReg, MachineFunction &MF,
.addReg(SrcReg)
.addMemOperand(MMO);
} else {
- MachineModuleInfoWasm &MMIW =
- MF.getMMI().getObjFileInfo<MachineModuleInfoWasm>();
BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::SET_GLOBAL_I32))
- .addImm(MMIW.getStackPointerGlobal())
+ .addExternalSymbol(SPSymbol)
.addReg(SrcReg);
}
}
@@ -171,10 +169,11 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
unsigned SPReg = WebAssembly::SP32;
if (StackSize)
SPReg = MRI.createVirtualRegister(PtrRC);
+
+ const char *ES = "__stack_pointer";
+ auto *SPSymbol = MF.createExternalSymbolName(ES);
if (MF.getSubtarget<WebAssemblySubtarget>()
.getTargetTriple().isOSBinFormatELF()) {
- const char *ES = "__stack_pointer";
- auto *SPSymbol = MF.createExternalSymbolName(ES);
unsigned Zero = MRI.createVirtualRegister(PtrRC);
BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), Zero)
@@ -189,22 +188,8 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
.addReg(Zero) // addr
.addMemOperand(LoadMMO);
} else {
- auto &MMIW = MF.getMMI().getObjFileInfo<MachineModuleInfoWasm>();
- if (!MMIW.hasStackPointerGlobal()) {
- MMIW.setStackPointerGlobal(MMIW.getGlobals().size());
-
- // Create the stack-pointer global. For now, just use the
- // Emscripten/Binaryen ABI names.
- wasm::Global G;
- G.Type = wasm::ValType::I32;
- G.Mutable = true;
- G.InitialValue = 0;
- G.InitialModule = "env";
- G.InitialName = "STACKTOP";
- MMIW.addGlobal(G);
- }
BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::GET_GLOBAL_I32), SPReg)
- .addImm(MMIW.getStackPointerGlobal());
+ .addExternalSymbol(SPSymbol);
}
bool HasBP = hasBP(MF);
diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index 6650191807dcb..ea9e3fa862ce2 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -170,28 +170,16 @@ static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
if (MI.mayStore()) {
Write = true;
- const MachineFunction &MF = *MI.getParent()->getParent();
- if (MF.getSubtarget<WebAssemblySubtarget>()
- .getTargetTriple().isOSBinFormatELF()) {
- // Check for stores to __stack_pointer.
- for (auto MMO : MI.memoperands()) {
- const MachinePointerInfo &MPI = MMO->getPointerInfo();
- if (MPI.V.is<const PseudoSourceValue *>()) {
- auto PSV = MPI.V.get<const PseudoSourceValue *>();
- if (const ExternalSymbolPseudoSourceValue *EPSV =
- dyn_cast<ExternalSymbolPseudoSourceValue>(PSV))
- if (StringRef(EPSV->getSymbol()) == "__stack_pointer")
- StackPointer = true;
- }
- }
- } else {
- // Check for sets of the stack pointer.
- const MachineModuleInfoWasm &MMIW =
- MF.getMMI().getObjFileInfo<MachineModuleInfoWasm>();
- if ((MI.getOpcode() == WebAssembly::SET_LOCAL_I32 ||
- MI.getOpcode() == WebAssembly::SET_LOCAL_I64) &&
- MI.getOperand(0).getImm() == MMIW.getStackPointerGlobal()) {
- StackPointer = true;
+ // Check for stores to __stack_pointer.
+ for (auto MMO : MI.memoperands()) {
+ const MachinePointerInfo &MPI = MMO->getPointerInfo();
+ if (MPI.V.is<const PseudoSourceValue *>()) {
+ auto PSV = MPI.V.get<const PseudoSourceValue *>();
+ if (const ExternalSymbolPseudoSourceValue *EPSV =
+ dyn_cast<ExternalSymbolPseudoSourceValue>(PSV))
+ if (StringRef(EPSV->getSymbol()) == "__stack_pointer") {
+ StackPointer = true;
+ }
}
}
} else if (MI.hasOrderedMemoryRef()) {
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index e5d3209ec6a97..d30cc724c203f 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1705,8 +1705,7 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data());
unsigned Len = DotDispStr.size();
- unsigned Val = OrigDispVal + DotDispVal;
- InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, Val);
+ InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, DotDispVal);
}
NewDisp = MCConstantExpr::create(OrigDispVal + DotDispVal, getContext());
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 7a9e4f4468ec7..914fb36f91a7d 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -108,12 +108,12 @@ public:
return Infos[Kind - FirstTargetFixupKind];
}
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel, MCContext &Ctx) const override {
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const override {
unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
- assert(Fixup.getOffset() + Size <= DataSize &&
- "Invalid fixup offset!");
+ assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!");
    // Check that upper bits are either all zeros or all ones.
// Specifically ignore overflow/underflow as long as the leakage is
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 4097ef224d503..caf98bffb80de 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -153,8 +153,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(
const MCSymbol *B_Base = Asm.getAtom(*B);
// Neither symbol can be modified.
- if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
- Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) {
+ if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None) {
Asm.getContext().reportError(Fixup.getLoc(),
"unsupported relocation of modified symbol");
return;
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index 105580c913a16..5892f1de33eec 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -10,6 +10,7 @@
#include "MCTargetDesc/X86FixupKinds.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCValue.h"
@@ -25,8 +26,8 @@ public:
X86WinCOFFObjectWriter(bool Is64Bit);
~X86WinCOFFObjectWriter() override = default;
- unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsCrossSection,
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsCrossSection,
const MCAsmBackend &MAB) const override;
};
@@ -36,11 +37,19 @@ X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit)
: MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64
: COFF::IMAGE_FILE_MACHINE_I386) {}
-unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target,
+unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
const MCFixup &Fixup,
bool IsCrossSection,
const MCAsmBackend &MAB) const {
- unsigned FixupKind = IsCrossSection ? FK_PCRel_4 : Fixup.getKind();
+ unsigned FixupKind = Fixup.getKind();
+ if (IsCrossSection) {
+ if (FixupKind != FK_Data_4) {
+ Ctx.reportError(Fixup.getLoc(), "Cannot represent this expression");
+ return COFF::IMAGE_REL_AMD64_ADDR32;
+ }
+ FixupKind = FK_PCRel_4;
+ }
MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
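
The reworked getRelocType no longer converts every cross-section fixup to FK_PCRel_4: only a plain 4-byte data fixup can legally be rewritten as a PC-relative value, and anything else now reports an error instead of emitting a wrong relocation. A distilled standalone version of that control flow, using a toy enum rather than the real MC fixup kinds:

#include <cstdio>

enum FixupKind { FK_Data_4, FK_Data_8, FK_PCRel_4 };

// Pick the fixup kind used for relocation selection; flag cross-section
// fixups that cannot be represented instead of silently converting them.
FixupKind classifyFixup(FixupKind Kind, bool IsCrossSection, bool &Error) {
  Error = false;
  if (IsCrossSection) {
    if (Kind != FK_Data_4) {
      Error = true;       // "cannot represent this expression"
      return Kind;        // dummy result; the caller checks Error
    }
    return FK_PCRel_4;    // a label difference becomes PC-relative
  }
  return Kind;
}

int main() {
  bool Err;
  classifyFixup(FK_Data_8, /*IsCrossSection=*/true, Err);
  std::printf("8-byte cross-section fixup rejected: %s\n", Err ? "yes" : "no");
}
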
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 2777fa89330f6..e3aa227702bea 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -748,17 +748,7 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
else
CallOp = X86::CALLpcrel32;
- const char *Symbol;
- if (Is64Bit) {
- if (STI.isTargetCygMing()) {
- Symbol = "___chkstk_ms";
- } else {
- Symbol = "__chkstk";
- }
- } else if (STI.isTargetCygMing())
- Symbol = "_alloca";
- else
- Symbol = "_chkstk";
+ StringRef Symbol = STI.getTargetLowering()->getStackProbeSymbolName(MF);
MachineInstrBuilder CI;
MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI);
@@ -769,10 +759,11 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
// For the large code model, we have to call through a register. Use R11,
// as it is scratch in all supported calling conventions.
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)
- .addExternalSymbol(Symbol);
+ .addExternalSymbol(MF.createExternalSymbolName(Symbol));
CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);
} else {
- CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol);
+ CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp))
+ .addExternalSymbol(MF.createExternalSymbolName(Symbol));
}
unsigned AX = Is64Bit ? X86::RAX : X86::EAX;
@@ -783,13 +774,16 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
.addReg(SP, RegState::Define | RegState::Implicit)
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
- if (Is64Bit) {
+ if (STI.isTargetWin64() || !STI.isOSWindows()) {
+ // MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves.
// MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
- // themselves. It also does not clobber %rax so we can reuse it when
+    // themselves. They also do not clobber %rax so we can reuse it when
// adjusting %rsp.
- BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
- .addReg(X86::RSP)
- .addReg(X86::RAX);
+ // All other platforms do not specify a particular ABI for the stack probe
+ // function, so we arbitrarily define it to not adjust %esp/%rsp itself.
+ BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Is64Bit)), SP)
+ .addReg(SP)
+ .addReg(AX);
}
if (InProlog) {
@@ -978,7 +972,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
X86FI->setCalleeSavedFrameSize(
X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
- bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO());
+ bool UseRedZone = false;
+ bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty();
// The default stack probe size is 4096 if the function has no stackprobesize
// attribute.
@@ -1007,6 +1002,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
!TRI->needsStackRealignment(MF) &&
!MFI.hasVarSizedObjects() && // No dynamic alloca.
!MFI.adjustsStack() && // No calls.
+ !UseStackProbe && // No stack probes.
!IsWin64CC && // Win64 has no Red Zone
!MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
!MF.shouldSplitStack()) { // Regular stack
@@ -1015,6 +1011,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0);
StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
MFI.setStackSize(StackSize);
+ UseRedZone = true;
}
// Insert stack pointer adjustment for later moving of return addr. Only
@@ -1192,6 +1189,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF))
AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign);
if (AlignedNumBytes >= StackProbeSize && UseStackProbe) {
+ assert(!UseRedZone && "The Red Zone is not accounted for in stack probes");
+
// Check whether EAX is livein for this block.
bool isEAXAlive = isEAXLiveIn(MBB);
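
The prologue now tracks UseRedZone explicitly and asserts that it never coexists with stack probing: the red-zone optimization deliberately leaves up to 128 bytes below %rsp unallocated, while a probe walk assumes every byte of the frame is touched in order. The size adjustment applied in the red-zone branch above, as plain arithmetic:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// SysV x86-64 leaf functions may use the 128-byte red zone below %rsp, so
// only the stack bytes beyond those 128 need an explicit %rsp adjustment.
uint64_t redZoneAdjustedSize(uint64_t StackSize, uint64_t MinSize) {
  return std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
}

int main() {
  std::printf("%llu\n", (unsigned long long)redZoneAdjustedSize(96, 0));  // 0: fits in the red zone
  std::printf("%llu\n", (unsigned long long)redZoneAdjustedSize(200, 0)); // 72: allocate only the overflow
}
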
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 2a1633de0a239..3c4589ab18f6f 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -204,6 +204,11 @@ namespace {
bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
+ template <class GatherScatterSDNode>
+ bool selectAddrOfGatherScatterNode(GatherScatterSDNode *Parent, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
bool selectMOV64Imm32(SDValue N, SDValue &Imm);
bool selectLEAAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
@@ -1415,13 +1420,10 @@ bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
return false;
}
-bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
- SDValue &Scale, SDValue &Index,
- SDValue &Disp, SDValue &Segment) {
-
- MaskedGatherScatterSDNode *Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent);
- if (!Mgs)
- return false;
+template <class GatherScatterSDNode>
+bool X86DAGToDAGISel::selectAddrOfGatherScatterNode(
+ GatherScatterSDNode *Mgs, SDValue N, SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp, SDValue &Segment) {
X86ISelAddressMode AM;
unsigned AddrSpace = Mgs->getPointerInfo().getAddrSpace();
// AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
@@ -1453,6 +1455,18 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
return true;
}
+bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ if (auto Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent))
+ return selectAddrOfGatherScatterNode<MaskedGatherScatterSDNode>(
+ Mgs, N, Base, Scale, Index, Disp, Segment);
+ if (auto X86Gather = dyn_cast<X86MaskedGatherSDNode>(Parent))
+ return selectAddrOfGatherScatterNode<X86MaskedGatherSDNode>(
+ X86Gather, N, Base, Scale, Index, Disp, Segment);
+ return false;
+}
+
/// Returns true if it is able to pattern match an addressing mode.
/// It returns the operands which make up the maximal addressing mode it can
/// match by reference.
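
selectVectorAddr is now a thin dispatcher over a templated worker, so the address-selection body is shared between the generic masked gather/scatter node and the new X86-specific gather node instead of being duplicated. The shape of that pattern in miniature, with a toy class hierarchy standing in for the SelectionDAG one:

#include <cstdio>

struct Node { virtual ~Node() = default; virtual int kind() const = 0; };
struct GenericGather : Node { int kind() const override { return 1; } };
struct X86Gather     : Node { int kind() const override { return 2; } };

// Toy stand-in for llvm::dyn_cast, keyed on kind().
template <class T> const T *dynCast(const Node *N) {
  return N->kind() == T().kind() ? static_cast<const T *>(N) : nullptr;
}

// One templated worker; both node flavors share its body.
template <class GatherT> bool selectAddrOf(const GatherT *G) {
  std::printf("selected address for node kind %d\n", G->kind());
  return true;
}

bool selectVectorAddr(const Node *Parent) {
  if (auto *G = dynCast<GenericGather>(Parent))
    return selectAddrOf<GenericGather>(G);
  if (auto *G = dynCast<X86Gather>(Parent))
    return selectAddrOf<X86Gather>(G);
  return false; // neither gather flavor: no vector address to select
}

int main() { X86Gather N; return selectVectorAddr(&N) ? 0 : 1; }
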
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 172eba0002d4f..f777e56289884 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1662,6 +1662,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
MaxStoresPerMemmoveOptSize = 4;
+
+ // TODO: These control memcmp expansion in CGP and are set low to prevent
+ // altering the vector expansion for 16/32 byte memcmp in SelectionDAGBuilder.
+ MaxLoadsPerMemcmp = 1;
+ MaxLoadsPerMemcmpOptSize = 1;
+
// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
@@ -14272,9 +14278,8 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
   // If we are inserting an element, see if we can do this more efficiently with
// a blend shuffle with a rematerializable vector than a costly integer
// insertion.
- // TODO: pre-SSE41 targets will tend to use bit masking - this could still
- // be beneficial if we are inserting several zeros and can combine the masks.
- if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) {
+ if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
+ 16 <= EltVT.getSizeInBits()) {
SmallVector<int, 8> BlendMask;
for (unsigned i = 0; i != NumElts; ++i)
BlendMask.push_back(i == IdxVal ? i + NumElts : i);
@@ -17621,23 +17626,21 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
-
SDValue CmpOp0 = Cmp.getOperand(0);
+
// Apply further optimizations for special cases
// (select (x != 0), -1, 0) -> neg & sbb
// (select (x == 0), 0, -1) -> neg & sbb
if (isNullConstant(Y) &&
- (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
- SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
- SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
- DAG.getConstant(0, DL,
- CmpOp0.getValueType()),
- CmpOp0);
- SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
- SDValue(Neg.getNode(), 1));
- return Res;
- }
+ (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
+ SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
+ SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
+ SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ SDValue(Neg.getNode(), 1));
+ return Res;
+ }
Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
@@ -18648,8 +18651,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool SplitStack = MF.shouldSplitStack();
+ bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
- SplitStack;
+ SplitStack || EmitStackProbe;
SDLoc dl(Op);
// Get the inputs.
@@ -23705,6 +23709,57 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SDValue RetOps[] = {Exract, NewGather.getValue(1)};
return DAG.getMergeValues(RetOps, dl);
}
+ if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) {
+    // There is a special case when the return type v2i32 is illegal and
+    // the type legalizer extended it to v2i64. Without this conversion we end up
+    // with VPGATHERQQ (reading q-words from memory) instead of VPGATHERQD.
+    // In order to avoid this situation, we'll build an X86-specific Gather node
+ // with index v2i64 and value type v4i32.
+ assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&
+ "Unexpected type in masked gather");
+ Src0 = DAG.getVectorShuffle(MVT::v4i32, dl,
+ DAG.getBitcast(MVT::v4i32, Src0),
+ DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
+ // The mask should match the destination type. Extending mask with zeroes
+    // is not necessary since the instruction itself reads only two values from
+ // memory.
+ Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(),
+ N->getMemOperand());
+
+ SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
+ NewGather.getValue(0), DAG);
+ SDValue RetOps[] = { Sext, NewGather.getValue(1) };
+ return DAG.getMergeValues(RetOps, dl);
+ }
+ if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) {
+ // This transformation is for optimization only.
+ // The type legalizer extended mask and index to 4 elements vector
+ // in order to match requirements of the common gather node - same
+ // vector width of index and value. X86 Gather node allows mismatch
+ // of vector width in order to select more optimal instruction at the
+ // end.
+ assert(VT == MVT::v4f32 && Src0.getValueType() == MVT::v4f32 &&
+ "Unexpected type in masked gather");
+ if (Mask.getOpcode() == ISD::CONCAT_VECTORS &&
+ ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) &&
+ Index.getOpcode() == ISD::CONCAT_VECTORS &&
+ Index.getOperand(1).isUndef()) {
+ Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false);
+ Index = Index.getOperand(0);
+ } else
+ return Op;
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ DAG.getVTList(MVT::v4f32, MVT::Other), Ops, dl, N->getMemoryVT(),
+ N->getMemOperand());
+
+ SDValue RetOps[] = { NewGather.getValue(0), NewGather.getValue(1) };
+ return DAG.getMergeValues(RetOps, dl);
+  }
return Op;
}
@@ -24508,6 +24563,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
case X86ISD::LWPINS: return "X86ISD::LWPINS";
+ case X86ISD::MGATHER: return "X86ISD::MGATHER";
}
return nullptr;
}
@@ -29868,7 +29924,7 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
if (N->getValueType(0) == MVT::i32)
Diff = (unsigned)Diff;
- bool isFastMultiplier = false;
+ bool IsFastMultiplier = false;
if (Diff < 10) {
switch ((unsigned char)Diff) {
default:
@@ -29880,12 +29936,12 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
case 5: // result = lea base(cond, cond*4)
case 8: // result = lea base( , cond*8)
case 9: // result = lea base(cond, cond*8)
- isFastMultiplier = true;
+ IsFastMultiplier = true;
break;
}
}
- if (isFastMultiplier) {
+ if (IsFastMultiplier) {
APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
if (NeedsCondInvert) // Invert the condition if needed.
Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
@@ -34841,23 +34897,56 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
!Cmp.getOperand(0).getValueType().isInteger())
return SDValue();
- // (cmp Z, 1) sets the carry flag if Z is 0.
SDValue Z = Cmp.getOperand(0);
- SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z,
- DAG.getConstant(1, DL, Z.getValueType()));
+ EVT ZVT = Z.getValueType();
+
+ // If X is -1 or 0, then we have an opportunity to avoid constants required in
+ // the general case below.
+ if (auto *ConstantX = dyn_cast<ConstantSDNode>(X)) {
+ // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
+ // fake operands:
+ // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
+ // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
+ if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
+ (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
+ SDValue Zero = DAG.getConstant(0, DL, ZVT);
+ SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ SDValue(Neg.getNode(), 1));
+ }
+
+ // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
+ // with fake operands:
+ // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
+ // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
+ if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
+ (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
+ SDValue One = DAG.getConstant(1, DL, ZVT);
+ SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
+ }
+ }
- SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
+ // (cmp Z, 1) sets the carry flag if Z is 0.
+ SDValue One = DAG.getConstant(1, DL, ZVT);
+ SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
+
+ // Add the flags type for ADC/SBB nodes.
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
// X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
// X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
if (CC == X86::COND_NE)
return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
- DAG.getConstant(-1ULL, DL, VT), NewCmp);
+ DAG.getConstant(-1ULL, DL, VT), Cmp1);
// X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
// X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
- DAG.getConstant(0, DL, VT), NewCmp);
+ DAG.getConstant(0, DL, VT), Cmp1);
}
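
The new ConstantX fast paths lean on two carry-flag identities: neg Z sets the carry exactly when Z != 0, and cmp Z, 1 sets it exactly when Z == 0, so sbb r, r (computing r - r - CF) materializes the 0 or -1 result without any constant register. The four commented equivalences are easy to check exhaustively in plain C++; this is a verification sketch, not the combine itself:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t z = 0; z < 100000; ++z) {
    // neg z borrows (sets CF) exactly when z != 0; sbb r, r then gives -CF.
    int32_t viaNeg = (z != 0) ? -1 : 0;
    assert(viaNeg == 0 - (z != 0 ? 1 : 0));   //  0 - (Z != 0)
    assert(viaNeg == -1 + (z == 0 ? 1 : 0));  // -1 + (Z == 0)

    // cmp z, 1 borrows exactly when z < 1, i.e. when z == 0.
    int32_t viaCmp1 = (z < 1) ? -1 : 0;
    assert(viaCmp1 == 0 - (z == 0 ? 1 : 0));  //  0 - (Z == 0)
    assert(viaCmp1 == -1 + (z != 0 ? 1 : 0)); // -1 + (Z != 0)
  }
  return 0;
}
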
static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
@@ -34976,6 +35065,32 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
}
+/// Convert vector increment or decrement to sub/add with an all-ones constant:
+/// add X, <1, 1...> --> sub X, <-1, -1...>
+/// sub X, <1, 1...> --> add X, <-1, -1...>
+/// The all-ones vector constant can be materialized using a pcmpeq instruction
+/// that is commonly recognized as an idiom (has no register dependency), so
+/// that's better/smaller than loading a splat 1 constant.
+static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
+  assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+         "Unexpected opcode for increment/decrement transform");
+
+ // Pseudo-legality check: getOnesVector() expects one of these types, so bail
+ // out and wait for legalization if we have an unsupported vector length.
+ EVT VT = N->getValueType(0);
+ if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
+ return SDValue();
+
+ SDNode *N1 = N->getOperand(1).getNode();
+ APInt SplatVal;
+ if (!ISD::isConstantSplatVector(N1, SplatVal) || !SplatVal.isOneValue())
+ return SDValue();
+
+ SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
+ unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
+ return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
+}
+
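
combineIncDecVector trades a splat-of-1 constant load for an all-ones operand because pcmpeq reg,reg is a recognized zero-dependency all-ones idiom, as the comment block above notes. The rewrite is plain two's-complement arithmetic, checked per 32-bit lane here with ordinary unsigned integers:

#include <cassert>
#include <cstdint>

int main() {
  // Per lane: add X, 1 == sub X, -1 and sub X, 1 == add X, -1, so inc/dec
  // can use the all-ones vector that pcmpeq materializes for free.
  const uint32_t AllOnes = 0xFFFFFFFFu; // two's-complement -1
  const uint32_t Lanes[] = {0u, 1u, 42u, 0x7FFFFFFFu, 0x80000000u, AllOnes};
  for (uint32_t X : Lanes) {
    assert(X + 1u == X - AllOnes);
    assert(X - 1u == X + AllOnes);
  }
  return 0;
}
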
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
const SDNodeFlags Flags = N->getFlags();
@@ -34995,6 +35110,9 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
isHorizontalBinOp(Op0, Op1, true))
return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
+ if (SDValue V = combineIncDecVector(N, DAG))
+ return V;
+
return combineAddOrSubToADCOrSBB(N, DAG);
}
@@ -35028,6 +35146,9 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
isHorizontalBinOp(Op0, Op1, false))
return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
+ if (SDValue V = combineIncDecVector(N, DAG))
+ return V;
+
return combineAddOrSubToADCOrSBB(N, DAG);
}
@@ -36335,3 +36456,22 @@ void X86TargetLowering::insertCopiesSplitCSR(
bool X86TargetLowering::supportSwiftError() const {
return Subtarget.is64Bit();
}
+
+/// Returns the name of the symbol used to emit stack probes or the empty
+/// string if not applicable.
+StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
+ // If the function specifically requests stack probes, emit them.
+ if (MF.getFunction()->hasFnAttribute("probe-stack"))
+ return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString();
+
+ // Generally, if we aren't on Windows, the platform ABI does not include
+ // support for stack probes, so don't emit them.
+ if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
+ return "";
+
+ // We need a stack probe to conform to the Windows ABI. Choose the right
+ // symbol.
+ if (Subtarget.is64Bit())
+ return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
+ return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
+}
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index f51b6641db2fb..e1ade92979dc0 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -615,7 +615,10 @@ namespace llvm {
// Vector truncating store with unsigned/signed saturation
VTRUNCSTOREUS, VTRUNCSTORES,
// Vector truncating masked store with unsigned/signed saturation
- VMTRUNCSTOREUS, VMTRUNCSTORES
+ VMTRUNCSTOREUS, VMTRUNCSTORES,
+
+ // X86 specific gather
+ MGATHER
// WARNING: Do not add anything in the end unless you want the node to
// have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
@@ -1056,6 +1059,8 @@ namespace llvm {
bool supportSwiftError() const override;
+ StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
+
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
/// \brief Lower interleaved load(s) into target specific
@@ -1065,6 +1070,12 @@ namespace llvm {
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
+ /// \brief Lower interleaved store(s) into target specific
+ /// instructions/intrinsics.
+ bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ unsigned Factor) const override;
+
void finalizeLowering(MachineFunction &MF) const override;
protected:
@@ -1397,6 +1408,19 @@ namespace llvm {
}
};
+ // X86 specific Gather node.
+ class X86MaskedGatherSDNode : public MaskedGatherScatterSDNode {
+ public:
+ X86MaskedGatherSDNode(unsigned Order,
+ const DebugLoc &dl, SDVTList VTs, EVT MemVT,
+ MachineMemOperand *MMO)
+ : MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT, MMO)
+ {}
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::MGATHER;
+ }
+ };
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 2620679df2517..01a70323224c3 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -7265,13 +7265,13 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
let Predicates = [HasAVX512] in {
def : Pat<(ffloor _.FRC:$src), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x1))), _.FRC)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x9))), _.FRC)>;
def : Pat<(fceil _.FRC:$src), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x2))), _.FRC)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xa))), _.FRC)>;
def : Pat<(ftrunc _.FRC:$src), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x3))), _.FRC)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xb))), _.FRC)>;
def : Pat<(frint _.FRC:$src), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
(_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x4))), _.FRC)>;
@@ -7281,13 +7281,13 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
- addr:$src, (i32 0x1))), _.FRC)>;
+ addr:$src, (i32 0x9))), _.FRC)>;
def : Pat<(fceil (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
- addr:$src, (i32 0x2))), _.FRC)>;
+ addr:$src, (i32 0xa))), _.FRC)>;
def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
- addr:$src, (i32 0x3))), _.FRC)>;
+ addr:$src, (i32 0xb))), _.FRC)>;
def : Pat<(frint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
addr:$src, (i32 0x4))), _.FRC)>;
@@ -7869,7 +7869,7 @@ let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
vx128xmem, mgatherv4i32>, EVEX_V128;
defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
- vx64xmem, mgatherv2i64>, EVEX_V128;
+ vx64xmem, X86mgatherv2i64>, EVEX_V128;
}
}
@@ -8471,26 +8471,26 @@ multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _,
}
let Predicates = [HasAVX512] in {
def : Pat<(v16f32 (ffloor VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0x1))>;
+ (VRNDSCALEPSZrri VR512:$src, (i32 0x9))>;
def : Pat<(v16f32 (fnearbyint VR512:$src)),
(VRNDSCALEPSZrri VR512:$src, (i32 0xC))>;
def : Pat<(v16f32 (fceil VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0x2))>;
+ (VRNDSCALEPSZrri VR512:$src, (i32 0xA))>;
def : Pat<(v16f32 (frint VR512:$src)),
(VRNDSCALEPSZrri VR512:$src, (i32 0x4))>;
def : Pat<(v16f32 (ftrunc VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0x3))>;
+ (VRNDSCALEPSZrri VR512:$src, (i32 0xB))>;
def : Pat<(v8f64 (ffloor VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0x1))>;
+ (VRNDSCALEPDZrri VR512:$src, (i32 0x9))>;
def : Pat<(v8f64 (fnearbyint VR512:$src)),
(VRNDSCALEPDZrri VR512:$src, (i32 0xC))>;
def : Pat<(v8f64 (fceil VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0x2))>;
+ (VRNDSCALEPDZrri VR512:$src, (i32 0xA))>;
def : Pat<(v8f64 (frint VR512:$src)),
(VRNDSCALEPDZrri VR512:$src, (i32 0x4))>;
def : Pat<(v8f64 (ftrunc VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0x3))>;
+ (VRNDSCALEPDZrri VR512:$src, (i32 0xB))>;
}
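
Every immediate change in this hunk (0x1 to 0x9, 0x2 to 0xA, 0x3 to 0xB) sets bit 3 of the rounding-control byte while keeping the mode bits intact. As I read the SSE4.1/AVX-512 encoding, bits [1:0] select the rounding mode, bit 2 defers to MXCSR, and bit 3 suppresses the precision (inexact) exception, which ffloor/fceil/ftrunc must not raise; verify the exact values against the ISA manual. A sketch of composing the byte:

#include <cstdint>
#include <cstdio>

// Control byte for ROUND*/VRNDSCALE*, per the SSE4.1/AVX-512 layout.
enum : uint8_t {
  RoundDown  = 0x1, // toward -inf (floor)
  RoundUp    = 0x2, // toward +inf (ceil)
  RoundTrunc = 0x3, // toward zero
  UseMXCSR   = 0x4, // take the rounding mode from MXCSR (frint keeps 0x4)
  NoPrecExc  = 0x8, // suppress the precision (inexact) exception
};

int main() {
  std::printf("ffloor     imm 0x%x\n", unsigned(RoundDown  | NoPrecExc)); // 0x9
  std::printf("fceil      imm 0x%x\n", unsigned(RoundUp    | NoPrecExc)); // 0xa
  std::printf("ftrunc     imm 0x%x\n", unsigned(RoundTrunc | NoPrecExc)); // 0xb
  std::printf("fnearbyint imm 0x%x\n", unsigned(UseMXCSR   | NoPrecExc)); // 0xc, already used above
  return 0;
}
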
defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>,
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index c28b35b22977a..8b5bbf24f6f63 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -641,7 +641,7 @@ def sdmem : Operand<v2f64> {
// SSE pattern fragments
//===----------------------------------------------------------------------===//
-// Vector load wrappers to prevent folding of non-temporal aligned loads on
+// Vector load wrappers to prevent folding of non-temporal aligned loads on
// supporting targets.
def vec128load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return !Subtarget->hasSSE41() || !cast<LoadSDNode>(N)->isNonTemporal() ||
@@ -754,16 +754,6 @@ def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
-// These are needed to match a scalar memop that is used in a vector-only
-// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
-// The memory operand is required to be a 128-bit load, so it must be converted
-// from a vector to a scalar.
-def memopfsf32_128 : PatFrag<(ops node:$ptr),
- (f32 (extractelt (memopv4f32 node:$ptr), (iPTR 0)))>;
-def memopfsf64_128 : PatFrag<(ops node:$ptr),
- (f64 (extractelt (memopv2f64 node:$ptr), (iPTR 0)))>;
-
-
// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
// 16-byte boundary.
// FIXME: 8 byte alignment for mmx reads is not required
@@ -773,6 +763,9 @@ def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>;
+def X86masked_gather : SDNode<"X86ISD::MGATHER", SDTMaskedGather,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(masked_gather node:$src1, node:$src2, node:$src3) , [{
if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
@@ -796,6 +789,15 @@ def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
Mgt->getBasePtr().getValueType() == MVT::v2i64);
return false;
}]>;
+def X86mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
+ if (X86MaskedGatherSDNode *Mgt = dyn_cast<X86MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v2i64 ||
+ Mgt->getBasePtr().getValueType() == MVT::v2i64) &&
+ (Mgt->getMemoryVT() == MVT::v2i32 ||
+ Mgt->getMemoryVT() == MVT::v2f32);
+ return false;
+}]>;
def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(masked_gather node:$src1, node:$src2, node:$src3) , [{
if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 8490b972eb5c1..fe87bbd994738 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1744,7 +1744,7 @@ def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG;
}
-def : Pat<(f32 (fpround FR64:$src)),
+def : Pat<(f32 (fpround FR64:$src)),
(VCVTSD2SSrr (COPY_TO_REGCLASS FR64:$src, FR32), FR64:$src)>,
Requires<[UseAVX]>;
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index 77dead8d24137..f98c2a7e802dd 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -72,9 +72,24 @@ private:
MachineFunction &MF) const;
bool selectCmp(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
-
bool selectUadde(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
+ bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectInsert(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectExtract(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+
+ // emit insert subreg instruction and insert it before MachineInstr &I
+ bool emitInsertSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I,
+ MachineRegisterInfo &MRI, MachineFunction &MF) const;
+ // emit extract subreg instruction and insert it before MachineInstr &I
+ bool emitExtractSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I,
+ MachineRegisterInfo &MRI, MachineFunction &MF) const;
+
+ const TargetRegisterClass *getRegClass(LLT Ty, const RegisterBank &RB) const;
+ const TargetRegisterClass *getRegClass(LLT Ty, unsigned Reg,
+ MachineRegisterInfo &MRI) const;
const X86TargetMachine &TM;
const X86Subtarget &STI;
@@ -113,8 +128,8 @@ X86InstructionSelector::X86InstructionSelector(const X86TargetMachine &TM,
// FIXME: This should be target-independent, inferred from the types declared
// for each class in the bank.
-static const TargetRegisterClass *
-getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) {
+const TargetRegisterClass *
+X86InstructionSelector::getRegClass(LLT Ty, const RegisterBank &RB) const {
if (RB.getID() == X86::GPRRegBankID) {
if (Ty.getSizeInBits() <= 8)
return &X86::GR8RegClass;
@@ -127,13 +142,13 @@ getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) {
}
if (RB.getID() == X86::VECRRegBankID) {
if (Ty.getSizeInBits() == 32)
- return &X86::FR32XRegClass;
+ return STI.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
if (Ty.getSizeInBits() == 64)
- return &X86::FR64XRegClass;
+ return STI.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
if (Ty.getSizeInBits() == 128)
- return &X86::VR128XRegClass;
+ return STI.hasAVX512() ? &X86::VR128XRegClass : &X86::VR128RegClass;
if (Ty.getSizeInBits() == 256)
- return &X86::VR256XRegClass;
+ return STI.hasAVX512() ? &X86::VR256XRegClass : &X86::VR256RegClass;
if (Ty.getSizeInBits() == 512)
return &X86::VR512RegClass;
}
@@ -141,10 +156,16 @@ getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) {
llvm_unreachable("Unknown RegBank!");
}
+const TargetRegisterClass *
+X86InstructionSelector::getRegClass(LLT Ty, unsigned Reg,
+ MachineRegisterInfo &MRI) const {
+ const RegisterBank &RegBank = *RBI.getRegBank(Reg, MRI, TRI);
+ return getRegClass(Ty, RegBank);
+}
+
// Set X86 Opcode and constrain DestReg.
-static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
- MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
- const RegisterBankInfo &RBI) {
+bool X86InstructionSelector::selectCopy(MachineInstr &I,
+ MachineRegisterInfo &MRI) const {
unsigned DstReg = I.getOperand(0).getReg();
if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
@@ -171,7 +192,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
switch (RegBank.getID()) {
case X86::GPRRegBankID:
assert((DstSize <= 64) && "GPRs cannot get more than 64-bit width values.");
- RC = getRegClassForTypeOnBank(MRI.getType(DstReg), RegBank);
+ RC = getRegClass(MRI.getType(DstReg), RegBank);
// Change the physical register
if (SrcSize > DstSize && TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
@@ -186,7 +207,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
}
break;
case X86::VECRRegBankID:
- RC = getRegClassForTypeOnBank(MRI.getType(DstReg), RegBank);
+ RC = getRegClass(MRI.getType(DstReg), RegBank);
break;
default:
llvm_unreachable("Unknown RegBank!");
@@ -220,7 +241,7 @@ bool X86InstructionSelector::select(MachineInstr &I) const {
// Certain non-generic instructions also need some special handling.
if (I.isCopy())
- return selectCopy(I, TII, MRI, TRI, RBI);
+ return selectCopy(I, MRI);
// TODO: handle more cases - LOAD_STACK_GUARD, PHI
return true;
@@ -249,6 +270,10 @@ bool X86InstructionSelector::select(MachineInstr &I) const {
return true;
if (selectUadde(I, MRI, MF))
return true;
+ if (selectExtract(I, MRI, MF))
+ return true;
+ if (selectInsert(I, MRI, MF))
+ return true;
return false;
}
@@ -326,6 +351,34 @@ unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB,
return Opc;
}
+// Fill in an address from the given instruction.
+void X86SelectAddress(const MachineInstr &I, const MachineRegisterInfo &MRI,
+ X86AddressMode &AM) {
+
+  assert(I.getOperand(0).isReg() && "unsupported operand.");
+ assert(MRI.getType(I.getOperand(0).getReg()).isPointer() &&
+ "unsupported type.");
+
+ if (I.getOpcode() == TargetOpcode::G_GEP) {
+ if (auto COff = getConstantVRegVal(I.getOperand(2).getReg(), MRI)) {
+ int64_t Imm = *COff;
+ if (isInt<32>(Imm)) { // Check for displacement overflow.
+ AM.Disp = static_cast<int32_t>(Imm);
+ AM.Base.Reg = I.getOperand(1).getReg();
+ return;
+ }
+ }
+ } else if (I.getOpcode() == TargetOpcode::G_FRAME_INDEX) {
+ AM.Base.FrameIndex = I.getOperand(1).getIndex();
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ return;
+ }
+
+ // Default behavior.
+ AM.Base.Reg = I.getOperand(0).getReg();
+ return;
+}
+
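
X86SelectAddress folds a constant G_GEP offset into the displacement only when it fits a signed 32-bit immediate, because x86 addressing modes carry no 64-bit displacement. The overflow check in isolation, with isInt<32> re-implemented inline for the sketch:

#include <cstdint>
#include <cstdio>

// x86 memory operands hold at most a signed 32-bit displacement, so a
// 64-bit constant offset folds only if it round-trips through int32_t.
bool foldDisplacement(int64_t Imm, int32_t &Disp) {
  if (Imm == (int64_t)(int32_t)Imm) { // what llvm::isInt<32>(Imm) checks
    Disp = (int32_t)Imm;
    return true;
  }
  return false; // too wide: leave the addition to a separate instruction
}

int main() {
  int32_t D;
  std::printf("%d\n", foldDisplacement(-4096, D));     // 1: folded
  std::printf("%d\n", foldDisplacement(1LL << 40, D)); // 0: out of range
}
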
bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
MachineRegisterInfo &MRI,
MachineFunction &MF) const {
@@ -340,18 +393,28 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
auto &MemOp = **I.memoperands_begin();
+ if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
+ DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+ return false;
+ }
+
unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlignment());
if (NewOpc == Opc)
return false;
+ X86AddressMode AM;
+ X86SelectAddress(*MRI.getVRegDef(I.getOperand(1).getReg()), MRI, AM);
+
I.setDesc(TII.get(NewOpc));
MachineInstrBuilder MIB(MF, I);
- if (Opc == TargetOpcode::G_LOAD)
- addOffset(MIB, 0);
- else {
+ if (Opc == TargetOpcode::G_LOAD) {
+ I.RemoveOperand(1);
+ addFullAddress(MIB, AM);
+ } else {
// G_STORE (VAL, Addr), X86Store instruction (Addr, VAL)
+ I.RemoveOperand(1);
I.RemoveOperand(0);
- addOffset(MIB, 0).addUse(DefReg);
+ addFullAddress(MIB, AM).addUse(DefReg);
}
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
@@ -461,11 +524,11 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I,
if (DstRB.getID() != X86::GPRRegBankID)
return false;
- const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB);
if (!DstRC)
return false;
- const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB);
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcRB);
if (!SrcRC)
return false;
@@ -519,9 +582,8 @@ bool X86InstructionSelector::selectZext(MachineInstr &I,
else
return false;
- const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
unsigned DefReg =
- MRI.createVirtualRegister(getRegClassForTypeOnBank(DstTy, RegBank));
+ MRI.createVirtualRegister(getRegClass(DstTy, DstReg, MRI));
BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(TargetOpcode::SUBREG_TO_REG), DefReg)
@@ -656,6 +718,202 @@ bool X86InstructionSelector::selectUadde(MachineInstr &I,
return true;
}
+bool X86InstructionSelector::selectExtract(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ if (I.getOpcode() != TargetOpcode::G_EXTRACT)
+ return false;
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+ int64_t Index = I.getOperand(2).getImm();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+
+  // For now, handle only vector types.
+ if (!DstTy.isVector())
+ return false;
+
+ if (Index % DstTy.getSizeInBits() != 0)
+    return false; // Not a subvector extract.
+
+ if (Index == 0) {
+ // Replace by extract subreg copy.
+ if (!emitExtractSubreg(DstReg, SrcReg, I, MRI, MF))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ bool HasAVX = STI.hasAVX();
+ bool HasAVX512 = STI.hasAVX512();
+ bool HasVLX = STI.hasVLX();
+
+ if (SrcTy.getSizeInBits() == 256 && DstTy.getSizeInBits() == 128) {
+ if (HasVLX)
+ I.setDesc(TII.get(X86::VEXTRACTF32x4Z256rr));
+ else if (HasAVX)
+ I.setDesc(TII.get(X86::VEXTRACTF128rr));
+ else
+ return false;
+ } else if (SrcTy.getSizeInBits() == 512 && HasAVX512) {
+ if (DstTy.getSizeInBits() == 128)
+ I.setDesc(TII.get(X86::VEXTRACTF32x4Zrr));
+ else if (DstTy.getSizeInBits() == 256)
+ I.setDesc(TII.get(X86::VEXTRACTF64x4Zrr));
+ else
+ return false;
+ } else
+ return false;
+
+ // Convert to X86 VEXTRACT immediate.
+ Index = Index / DstTy.getSizeInBits();
+ I.getOperand(2).setImm(Index);
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+bool X86InstructionSelector::emitExtractSubreg(unsigned DstReg, unsigned SrcReg,
+ MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+ unsigned SubIdx = X86::NoSubRegister;
+
+ if (!DstTy.isVector() || !SrcTy.isVector())
+ return false;
+
+ assert(SrcTy.getSizeInBits() > DstTy.getSizeInBits() &&
+ "Incorrect Src/Dst register size");
+
+ if (DstTy.getSizeInBits() == 128)
+ SubIdx = X86::sub_xmm;
+ else if (DstTy.getSizeInBits() == 256)
+ SubIdx = X86::sub_ymm;
+ else
+ return false;
+
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstReg, MRI);
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcReg, MRI);
+
+ SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubIdx);
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+    DEBUG(dbgs() << "Failed to constrain EXTRACT_SUBREG\n");
+ return false;
+ }
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), DstReg)
+ .addReg(SrcReg, 0, SubIdx);
+
+ return true;
+}
+
+bool X86InstructionSelector::emitInsertSubreg(unsigned DstReg, unsigned SrcReg,
+ MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+ unsigned SubIdx = X86::NoSubRegister;
+
+ // TODO: support scalar types
+ if (!DstTy.isVector() || !SrcTy.isVector())
+ return false;
+
+ assert(SrcTy.getSizeInBits() < DstTy.getSizeInBits() &&
+ "Incorrect Src/Dst register size");
+
+ if (SrcTy.getSizeInBits() == 128)
+ SubIdx = X86::sub_xmm;
+ else if (SrcTy.getSizeInBits() == 256)
+ SubIdx = X86::sub_ymm;
+ else
+ return false;
+
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcReg, MRI);
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstReg, MRI);
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ DEBUG(dbgs() << "Failed to constrain INSERT_SUBREG\n");
+ return false;
+ }
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY))
+ .addReg(DstReg, RegState::DefineNoRead, SubIdx)
+ .addReg(SrcReg);
+
+ return true;
+}
+
+bool X86InstructionSelector::selectInsert(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ if (I.getOpcode() != TargetOpcode::G_INSERT)
+ return false;
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+ const unsigned InsertReg = I.getOperand(2).getReg();
+ int64_t Index = I.getOperand(3).getImm();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT InsertRegTy = MRI.getType(InsertReg);
+
+  // For now, handle only vector types.
+ if (!DstTy.isVector())
+ return false;
+
+ if (Index % InsertRegTy.getSizeInBits() != 0)
+    return false; // Not a subvector insert.
+
+ if (Index == 0 && MRI.getVRegDef(SrcReg)->isImplicitDef()) {
+ // Replace by subreg copy.
+ if (!emitInsertSubreg(DstReg, InsertReg, I, MRI, MF))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ bool HasAVX = STI.hasAVX();
+ bool HasAVX512 = STI.hasAVX512();
+ bool HasVLX = STI.hasVLX();
+
+ if (DstTy.getSizeInBits() == 256 && InsertRegTy.getSizeInBits() == 128) {
+ if (HasVLX)
+ I.setDesc(TII.get(X86::VINSERTF32x4Z256rr));
+ else if (HasAVX)
+ I.setDesc(TII.get(X86::VINSERTF128rr));
+ else
+ return false;
+ } else if (DstTy.getSizeInBits() == 512 && HasAVX512) {
+ if (InsertRegTy.getSizeInBits() == 128)
+ I.setDesc(TII.get(X86::VINSERTF32x4Zrr));
+ else if (InsertRegTy.getSizeInBits() == 256)
+ I.setDesc(TII.get(X86::VINSERTF64x4Zrr));
+ else
+ return false;
+ } else
+ return false;
+
+ // Convert to X86 VINSERT immediate.
+ Index = Index / InsertRegTy.getSizeInBits();
+
+ I.getOperand(3).setImm(Index);
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
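
Both selectExtract and selectInsert finish by rescaling the G_EXTRACT/G_INSERT bit offset into the lane index the VEXTRACT/VINSERT immediate expects, since the instruction counts whole subvectors rather than bits. A toy version of that conversion:

#include <cassert>
#include <cstdint>

// Convert a bit offset within a vector register into the whole-subvector
// lane index encoded in the VEXTRACTF/VINSERTF immediate.
bool bitOffsetToLane(int64_t BitIndex, int64_t SubVecBits, int64_t &Lane) {
  if (BitIndex % SubVecBits != 0)
    return false; // not a whole-subvector access: let another path select it
  Lane = BitIndex / SubVecBits;
  return true;
}

int main() {
  int64_t Lane;
  assert(bitOffsetToLane(256, 128, Lane) && Lane == 2); // third xmm lane of a zmm
  assert(!bitOffsetToLane(64, 128, Lane));              // misaligned offset
  return 0;
}
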
InstructionSelector *
llvm::createX86InstructionSelector(const X86TargetMachine &TM,
X86Subtarget &Subtarget,
diff --git a/lib/Target/X86/X86InterleavedAccess.cpp b/lib/Target/X86/X86InterleavedAccess.cpp
index 806d6cc888f0f..f0ed4bc16e2f9 100644
--- a/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/lib/Target/X86/X86InterleavedAccess.cpp
@@ -16,6 +16,7 @@
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
+#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;
@@ -50,9 +51,8 @@ class X86InterleavedAccessGroup {
IRBuilder<> &Builder;
/// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
- /// sub vectors of type \p T. Returns true and the sub-vectors in
- /// \p DecomposedVectors if it decomposes the Inst, returns false otherwise.
- bool decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
+ /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
+ void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
SmallVectorImpl<Instruction *> &DecomposedVectors);
/// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
@@ -80,8 +80,7 @@ public:
/// target information \p STarget.
explicit X86InterleavedAccessGroup(Instruction *I,
ArrayRef<ShuffleVectorInst *> Shuffs,
- ArrayRef<unsigned> Ind,
- const unsigned F,
+ ArrayRef<unsigned> Ind, const unsigned F,
const X86Subtarget &STarget,
IRBuilder<> &B)
: Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
@@ -102,48 +101,61 @@ bool X86InterleavedAccessGroup::isSupported() const {
uint64_t ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy);
Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
- if (DL.getTypeSizeInBits(Inst->getType()) < Factor * ShuffleVecSize)
- return false;
+ // Currently, lowering is supported for 4-element vectors of 64 bits on AVX.
+ uint64_t ExpectedShuffleVecSize;
+ if (isa<LoadInst>(Inst))
+ ExpectedShuffleVecSize = 256;
+ else
+ ExpectedShuffleVecSize = 1024;
- // Currently, lowering is supported for 64 bits on AVX.
- if (!Subtarget.hasAVX() || ShuffleVecSize != 256 ||
+ if (!Subtarget.hasAVX() || ShuffleVecSize != ExpectedShuffleVecSize ||
DL.getTypeSizeInBits(ShuffleEltTy) != 64 || Factor != 4)
return false;
return true;
}
-bool X86InterleavedAccessGroup::decompose(
+void X86InterleavedAccessGroup::decompose(
Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
SmallVectorImpl<Instruction *> &DecomposedVectors) {
+
+ assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
+ "Expected Load or Shuffle");
+
Type *VecTy = VecInst->getType();
(void)VecTy;
assert(VecTy->isVectorTy() &&
DL.getTypeSizeInBits(VecTy) >=
DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
"Invalid Inst-size!!!");
- assert(VecTy->getVectorElementType() == SubVecTy->getVectorElementType() &&
- "Element type mismatched!!!");
- if (!isa<LoadInst>(VecInst))
- return false;
+ if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
+ Value *Op0 = SVI->getOperand(0);
+ Value *Op1 = SVI->getOperand(1);
+
+ // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
+ for (unsigned i = 0; i < NumSubVectors; ++i)
+ DecomposedVectors.push_back(
+ cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
+ Op0, Op1, createSequentialMask(Builder, Indices[i],
+ SubVecTy->getVectorNumElements(), 0))));
+ return;
+ }
+ // Decompose the load instruction.
LoadInst *LI = cast<LoadInst>(VecInst);
Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
-
Value *VecBasePtr =
Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
- // Generate N loads of T type
+ // Generate N loads of T type.
for (unsigned i = 0; i < NumSubVectors; i++) {
- // TODO: Support inbounds GEP
+ // TODO: Support inbounds GEP.
Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
Instruction *NewLoad =
Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
DecomposedVectors.push_back(NewLoad);
}
-
- return true;
}
void X86InterleavedAccessGroup::transpose_4x4(
@@ -181,21 +193,46 @@ void X86InterleavedAccessGroup::transpose_4x4(
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
SmallVector<Instruction *, 4> DecomposedVectors;
- VectorType *VecTy = Shuffles[0]->getType();
- // Try to generate target-sized register(/instruction).
- if (!decompose(Inst, Factor, VecTy, DecomposedVectors))
- return false;
-
SmallVector<Value *, 4> TransposedVectors;
- // Perform matrix-transposition in order to compute interleaved
- // results by generating some sort of (optimized) target-specific
- // instructions.
+ VectorType *ShuffleTy = Shuffles[0]->getType();
+
+ if (isa<LoadInst>(Inst)) {
+ // Try to generate target-sized register(/instruction).
+ decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
+
+ // Perform matrix-transposition in order to compute interleaved
+ // results by generating some sort of (optimized) target-specific
+ // instructions.
+ transpose_4x4(DecomposedVectors, TransposedVectors);
+
+ // Now replace the unoptimized-interleaved-vectors with the
+ // transposed-interleaved vectors.
+ for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
+ Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
+
+ return true;
+ }
+
+ Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
+ unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;
+
+ // Lower the interleaved stores:
+ // 1. Decompose the interleaved wide shuffle into individual shuffle
+ // vectors.
+ decompose(Shuffles[0], Factor,
+ VectorType::get(ShuffleEltTy, NumSubVecElems), DecomposedVectors);
+
+ // 2. Transpose the interleaved-vectors into vectors of contiguous
+ // elements.
transpose_4x4(DecomposedVectors, TransposedVectors);
- // Now replace the unoptimized-interleaved-vectors with the
- // transposed-interleaved vectors.
- for (unsigned i = 0; i < Shuffles.size(); i++)
- Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
+ // 3. Concatenate the contiguous-vectors back into a wide vector.
+ Value *WideVec = concatenateVectors(Builder, TransposedVectors);
+
+ // 4. Generate a store instruction for wide-vec.
+ StoreInst *SI = cast<StoreInst>(Inst);
+ Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
+ SI->getAlignment());
return true;
}
@@ -220,3 +257,29 @@ bool X86TargetLowering::lowerInterleavedLoad(
return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}
+
+bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
+ ShuffleVectorInst *SVI,
+ unsigned Factor) const {
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
+
+ assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
+ "Invalid interleaved store");
+
+ // Holds the indices of SVI that correspond to the starting index of each
+ // interleaved shuffle.
+ SmallVector<unsigned, 4> Indices;
+ auto Mask = SVI->getShuffleMask();
+ for (unsigned i = 0; i < Factor; i++)
+ Indices.push_back(Mask[i]);
+
+ ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);
+
+ // Create an interleaved access group.
+ IRBuilder<> Builder(SI);
+ X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
+ Builder);
+
+ return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
+}
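
lowerInterleavedStore records Mask[0..Factor-1] as the starting index of each interleaved sub-shuffle; for the supported factor-4 case the wide store mask re-interleaves four source vectors, so the first four entries identify where each contiguous run begins, and decompose() later rebuilds a sequential mask from each one. A small sketch of pulling those indices out of such a mask (the mask values are an illustrative example):

#include <cstdio>
#include <vector>

int main() {
  // Factor-4 re-interleave of four 4-element vectors:
  // v0[0] v1[0] v2[0] v3[0] v0[1] v1[1] ...
  std::vector<int> Mask = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
  unsigned Factor = 4;

  // The first Factor entries are the starting indices of the sub-shuffles.
  std::vector<unsigned> Indices(Mask.begin(), Mask.begin() + Factor);
  for (unsigned I : Indices)
    std::printf("%u ", I); // prints: 0 4 8 12
  std::printf("\n");
  return 0;
}
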
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index bc73bb1ae8c51..6b1add8ff8ed1 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -510,12 +510,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC,
X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_d_128, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_d_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_d_512, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM,
@@ -524,16 +518,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM,
X86ISD::CMPM_RND),
- X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC,
X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC,
X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
- X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG,
@@ -1171,18 +1159,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FSUBS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FSUBS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_d_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_d_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_d_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_q_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_q_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_q_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM,
X86ISD::CVTPH2PS, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM,
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index 979aaee110aa4..a584eabcc1b28 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -214,12 +214,24 @@ void X86LegalizerInfo::setLegalizerInfoAVX() {
if (!Subtarget.hasAVX())
return;
+ const LLT v16s8 = LLT::vector(16, 8);
+ const LLT v8s16 = LLT::vector(8, 16);
+ const LLT v4s32 = LLT::vector(4, 32);
+ const LLT v2s64 = LLT::vector(2, 64);
+
+ const LLT v32s8 = LLT::vector(32, 8);
+ const LLT v16s16 = LLT::vector(16, 16);
const LLT v8s32 = LLT::vector(8, 32);
const LLT v4s64 = LLT::vector(4, 64);
for (unsigned MemOp : {G_LOAD, G_STORE})
for (auto Ty : {v8s32, v4s64})
setAction({MemOp, Ty}, Legal);
+
+ for (auto Ty : {v32s8, v16s16, v8s32, v4s64})
+ setAction({G_INSERT, Ty}, Legal);
+ for (auto Ty : {v16s8, v8s16, v4s32, v2s64})
+ setAction({G_INSERT, 1, Ty}, Legal);
}
void X86LegalizerInfo::setLegalizerInfoAVX2() {
@@ -243,6 +255,18 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() {
if (!Subtarget.hasAVX512())
return;
+ const LLT v16s8 = LLT::vector(16, 8);
+ const LLT v8s16 = LLT::vector(8, 16);
+ const LLT v4s32 = LLT::vector(4, 32);
+ const LLT v2s64 = LLT::vector(2, 64);
+
+ const LLT v32s8 = LLT::vector(32, 8);
+ const LLT v16s16 = LLT::vector(16, 16);
+ const LLT v8s32 = LLT::vector(8, 32);
+ const LLT v4s64 = LLT::vector(4, 64);
+
+ const LLT v64s8 = LLT::vector(64, 8);
+ const LLT v32s16 = LLT::vector(32, 16);
const LLT v16s32 = LLT::vector(16, 32);
const LLT v8s64 = LLT::vector(8, 64);
@@ -256,13 +280,15 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() {
for (auto Ty : {v16s32, v8s64})
setAction({MemOp, Ty}, Legal);
+ for (auto Ty : {v64s8, v32s16, v16s32, v8s64})
+ setAction({G_INSERT, Ty}, Legal);
+ for (auto Ty : {v32s8, v16s16, v8s32, v4s64, v16s8, v8s16, v4s32, v2s64})
+ setAction({G_INSERT, 1, Ty}, Legal);
+
/************ VLX *******************/
if (!Subtarget.hasVLX())
return;
- const LLT v4s32 = LLT::vector(4, 32);
- const LLT v8s32 = LLT::vector(8, 32);
-
for (auto Ty : {v4s32, v8s32})
setAction({G_MUL, Ty}, Legal);
}
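
The two setAction forms above constrain different type indices of G_INSERT: {G_INSERT, Ty} keys on type index 0 (the wide destination vector), while {G_INSERT, 1, Ty} keys on type index 1 (the subvector being inserted). A self-contained model of the table these calls populate (the map layout is an assumption for illustration, not LegalizerInfo's real representation):

    #include <map>
    #include <string>
    #include <tuple>

    int main() {
      enum Action { Legal };
      // Key: (opcode, type index, LLT spelled as a string).
      std::map<std::tuple<std::string, unsigned, std::string>, Action> Actions;

      // setAction({G_INSERT, Ty}, Legal): type index 0, the destination.
      for (const char *Ty : {"v32s8", "v16s16", "v8s32", "v4s64"})
        Actions[{"G_INSERT", 0, Ty}] = Legal;

      // setAction({G_INSERT, 1, Ty}, Legal): type index 1, the subvector.
      for (const char *Ty : {"v16s8", "v8s16", "v4s32", "v2s64"})
        Actions[{"G_INSERT", 1, Ty}] = Legal;

      // E.g. inserting a v4s32 subvector (128 bits) is marked legal.
      return Actions.count({"G_INSERT", 1, "v4s32"}) ? 0 : 1;
    }
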
diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp
index dd21e2b7c4a13..8fdf10617059a 100644
--- a/lib/Target/X86/X86MacroFusion.cpp
+++ b/lib/Target/X86/X86MacroFusion.cpp
@@ -2,39 +2,31 @@
//
// The LLVM Compiler Infrastructure
//
-// \file This file is distributed under the University of Illinois Open Source
+// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
-// This file contains the X86 implementation of the DAG scheduling mutation to
-// pair instructions back to back.
+/// \file This file contains the X86 implementation of the DAG scheduling
+/// mutation to pair instructions back to back.
//
//===----------------------------------------------------------------------===//
#include "X86MacroFusion.h"
#include "X86Subtarget.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetInstrInfo.h"
-
-#define DEBUG_TYPE "misched"
-
-STATISTIC(NumFused, "Number of instr pairs fused");
+#include "llvm/CodeGen/MacroFusion.h"
using namespace llvm;
-static cl::opt<bool> EnableMacroFusion("x86-misched-fusion", cl::Hidden,
- cl::desc("Enable scheduling for macro fusion."), cl::init(true));
-
-namespace {
-
-/// \brief Verify that the instruction pair, First and Second,
-/// should be scheduled back to back. If either instruction is unspecified,
-/// then verify that the other instruction may be part of a pair at all.
-static bool shouldScheduleAdjacent(const X86Subtarget &ST,
- const MachineInstr *First,
- const MachineInstr *Second) {
+/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
+/// together. When FirstMI is unspecified, check instead whether SecondMI
+/// may be part of a fused pair at all.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+ const TargetSubtargetInfo &TSI,
+ const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ const X86Subtarget &ST = static_cast<const X86Subtarget&>(TSI);
// Check if this processor supports macro-fusion. Since this is a minor
// heuristic, we haven't specifically reserved a feature. hasAVX is a decent
// proxy for SandyBridge+.
@@ -47,13 +39,10 @@ static bool shouldScheduleAdjacent(const X86Subtarget &ST,
FuseInc
} FuseKind;
- assert((First || Second) && "At least one instr must be specified");
- unsigned FirstOpcode = First
- ? First->getOpcode()
+ unsigned FirstOpcode = FirstMI
+ ? FirstMI->getOpcode()
: static_cast<unsigned>(X86::INSTRUCTION_LIST_END);
- unsigned SecondOpcode = Second
- ? Second->getOpcode()
- : static_cast<unsigned>(X86::INSTRUCTION_LIST_END);
+ unsigned SecondOpcode = SecondMI.getOpcode();
switch (SecondOpcode) {
default:
@@ -203,69 +192,11 @@ static bool shouldScheduleAdjacent(const X86Subtarget &ST,
}
}
-/// \brief Post-process the DAG to create cluster edges between instructions
-/// that may be fused by the processor into a single operation.
-class X86MacroFusion : public ScheduleDAGMutation {
-public:
- X86MacroFusion() {}
-
- void apply(ScheduleDAGInstrs *DAGInstrs) override;
-};
-
-void X86MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
- ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
- const X86Subtarget &ST = DAG->MF.getSubtarget<X86Subtarget>();
-
- // For now, assume targets can only fuse with the branch.
- SUnit &ExitSU = DAG->ExitSU;
- MachineInstr *Branch = ExitSU.getInstr();
- if (!Branch || !shouldScheduleAdjacent(ST, nullptr, Branch))
- return;
-
- for (SDep &PredDep : ExitSU.Preds) {
- if (PredDep.isWeak())
- continue;
- SUnit &SU = *PredDep.getSUnit();
- MachineInstr &Pred = *SU.getInstr();
- if (!shouldScheduleAdjacent(ST, &Pred, Branch))
- continue;
-
- // Create a single weak edge from SU to ExitSU. The only effect is to cause
- // bottom-up scheduling to heavily prioritize the clustered SU. There is no
- // need to copy predecessor edges from ExitSU to SU, since top-down
- // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
- // of SU, we could create an artificial edge from the deepest root, but it
- // hasn't been needed yet.
- bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
- (void)Success;
- assert(Success && "No DAG nodes should be reachable from ExitSU");
-
- // Adjust latency of data deps between the nodes.
- for (SDep &PredDep : ExitSU.Preds)
- if (PredDep.getSUnit() == &SU)
- PredDep.setLatency(0);
- for (SDep &SuccDep : SU.Succs)
- if (SuccDep.getSUnit() == &ExitSU)
- SuccDep.setLatency(0);
-
- ++NumFused;
- DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse ";
- SU.print(dbgs(), DAG);
- dbgs() << " - ExitSU"
- << " / " << DAG->TII->getName(Pred.getOpcode()) << " - "
- << DAG->TII->getName(Branch->getOpcode()) << '\n';);
-
- break;
- }
-}
-
-} // end namespace
-
namespace llvm {
std::unique_ptr<ScheduleDAGMutation>
createX86MacroFusionDAGMutation () {
- return EnableMacroFusion ? make_unique<X86MacroFusion>() : nullptr;
+ return createBranchMacroFusionDAGMutation(shouldScheduleAdjacent);
}
} // end namespace llvm
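
The net effect of this file's change is that X86 no longer builds its own ScheduleDAGMutation; it hands shouldScheduleAdjacent to the generic createBranchMacroFusionDAGMutation, which owns the cluster-edge plumbing that the deleted apply() used to do. The predicate's shape, a nullable FirstMI so callers can first ask whether SecondMI can tail a fused pair at all, can be modeled in isolation (the opcodes and fusion rules below are invented for illustration, not x86's real tables):

    #include <optional>

    enum Opcode { CMP, TEST, ADD, JCC, JMP };

    // Returns true when the pair may issue as one macro-op. An empty
    // First (like FirstMI == nullptr above) probes Second on its own.
    static bool shouldFuse(std::optional<Opcode> First, Opcode Second) {
      if (Second != JCC) // only conditional branches end a pair here
        return false;
      if (!First)        // may Second be the tail of any fused pair?
        return true;
      switch (*First) {  // compare-like ops may fuse with the branch
      case CMP:
      case TEST:
      case ADD:
        return true;
      default:
        return false;
      }
    }

    int main() {
      return (shouldFuse(std::nullopt, JCC) && shouldFuse(CMP, JCC) &&
              !shouldFuse(JMP, JCC))
                 ? 0 : 1;
    }
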
diff --git a/lib/Target/X86/X86MacroFusion.h b/lib/Target/X86/X86MacroFusion.h
index e630f802e8e63..13fa2d78a0185 100644
--- a/lib/Target/X86/X86MacroFusion.h
+++ b/lib/Target/X86/X86MacroFusion.h
@@ -2,23 +2,18 @@
//
// The LLVM Compiler Infrastructure
//
-// \file This file is distributed under the University of Illinois Open Source
+// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
-// This file contains the X86 definition of the DAG scheduling mutation to pair
-// instructions back to back.
+/// \file This file contains the X86 definition of the DAG scheduling mutation
+/// to pair instructions back to back.
//
//===----------------------------------------------------------------------===//
-#include "X86InstrInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
-//===----------------------------------------------------------------------===//
-// X86MacroFusion - DAG post-processing to encourage fusion of macro ops.
-//===----------------------------------------------------------------------===//
-
namespace llvm {
/// Note that you have to add:
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 278b57eb00b74..a9f42cacf7886 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -91,6 +91,8 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
return llvm::make_unique<X86FreeBSDTargetObjectFile>();
if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU())
return llvm::make_unique<X86LinuxNaClTargetObjectFile>();
+ if (TT.isOSSolaris())
+ return llvm::make_unique<X86SolarisTargetObjectFile>();
if (TT.isOSFuchsia())
return llvm::make_unique<X86FuchsiaTargetObjectFile>();
if (TT.isOSBinFormatELF())
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 4fd95717478e9..8627c06d44313 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -86,6 +86,12 @@ X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx,
InitializeELF(TM.Options.UseInitArray);
}
+void X86SolarisTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
+
const MCExpr *X86WindowsTargetObjectFile::lowerRelativeReference(
const GlobalValue *LHS, const GlobalValue *RHS,
const TargetMachine &TM) const {
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index 39d2e84e5ed77..f6aa570b6332a 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -66,6 +66,11 @@ namespace llvm {
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
};
+ /// \brief This implementation is used for Solaris on x86/x86-64.
+ class X86SolarisTargetObjectFile : public X86ELFTargetObjectFile {
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+ };
+
/// \brief This implementation is used for Windows targets on x86 and x86-64.
class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF {
const MCExpr *
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 11ba7025e1b73..5ba8534d32d33 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2178,17 +2178,6 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
}
-bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
- TargetTransformInfo::LSRCost &C2) {
- // X86 specific here are "instruction number 1st priority".
- return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
- C1.NumIVMuls, C1.NumBaseAdds,
- C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
- std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
- C2.NumIVMuls, C2.NumBaseAdds,
- C2.ScaleCost, C2.ImmCost, C2.SetupCost);
-}
-
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
Type *ScalarTy = DataTy->getScalarType();
int DataWidth = isa<PointerType>(ScalarTy) ?
@@ -2243,6 +2232,12 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
return (CallerBits & CalleeBits) == CalleeBits;
}
+bool X86TTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
+ // TODO: We can increase these based on available vector ops.
+ MaxLoadSize = ST->is64Bit() ? 8 : 4;
+ return true;
+}
+
bool X86TTIImpl::enableInterleavedAccessVectorization() {
// TODO: We expect this to be beneficial regardless of arch,
// but there are currently some unexplained performance artifacts on Atom.
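
The expandMemCmp hook above tells the generic pass how wide its expansion loads may be: 8 bytes on 64-bit targets, 4 otherwise. A hedged sketch of what a 16-byte memcmp expands into under MaxLoadSize = 8 (a hand-written little-endian model, not the IR the pass actually emits):

    #include <cstdint>
    #include <cstring>

    // Model of an expanded memcmp(A, B, 16) using two 8-byte loads.
    static int cmpBlock16(const void *A, const void *B) {
      for (int Off = 0; Off < 16; Off += 8) {
        uint64_t X, Y;
        std::memcpy(&X, static_cast<const char *>(A) + Off, 8);
        std::memcpy(&Y, static_cast<const char *>(B) + Off, 8);
        if (X != Y) {
          // Little-endian loads reverse byte significance; bswap
          // restores memcmp's lexicographic (big-endian) ordering.
          X = __builtin_bswap64(X);
          Y = __builtin_bswap64(Y);
          return X < Y ? -1 : 1;
        }
      }
      return 0;
    }

    int main() {
      const char A[16] = "abcdefghABCDEFG";
      const char B[16] = "abcdefghABCDEFZ";
      return cmpBlock16(A, B) < 0 ? 0 : 1; // 'G' < 'Z', so 0
    }
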
@@ -2250,6 +2245,114 @@ bool X86TTIImpl::enableInterleavedAccessVectorization() {
return !(ST->isAtom());
}
+// Get a cost estimate for interleaved load/store operations on AVX2.
+// \p Factor is the interleaved-access factor (stride), i.e. the number of
+// (interleaved) elements in the group.
+// \p Indices contains the indices for a strided load: when the
+// interleaved load has gaps, they indicate which elements are used.
+// If Indices is empty (or if the number of indices equals the size of the
+// interleaved access as given in \p Factor), the access has no gaps.
+//
+// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
+// computing the cost using a generic formula as a function of generic
+// shuffles. We therefore use a lookup table instead, filled according to
+// the instruction sequences that codegen currently generates.
+int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+
+  // We currently support only fully-interleaved groups, with no gaps.
+ // TODO: Support also strided loads (interleaved-groups with gaps).
+ if (Indices.size() && Indices.size() != Factor)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ // VecTy for interleave memop is <VF*Factor x Elt>.
+ // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
+ // VecTy = <12 x i32>.
+ MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+
+ // This function can be called with VecTy=<6xi128>, Factor=3, in which case
+  // VF = 2, while v2i128 is an unsupported MVT vector type
+ // (see MachineValueType.h::getVectorVT()).
+ if (!LegalVT.isVector())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ unsigned VF = VecTy->getVectorNumElements() / Factor;
+ Type *ScalarTy = VecTy->getVectorElementType();
+
+  // Calculate the number of memory operations (NumOfMemOps) required
+  // to load/store the VecTy.
+ unsigned VecTySize = DL.getTypeStoreSize(VecTy);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+
+ // Get the cost of one memory operation.
+ Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
+ LegalVT.getVectorNumElements());
+ unsigned MemOpCost =
+ getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+
+ VectorType *VT = VectorType::get(ScalarTy, VF);
+ EVT ETy = TLI->getValueType(DL, VT);
+ if (!ETy.isSimple())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+  // TODO: Complete for other data types and strides.
+  // Each combination of Stride, ElementTy and VF results in a different
+  // sequence; the cost tables are therefore accessed with
+  // Factor (stride) and VectorType = VF x ElemType.
+  // The cost accounts only for the shuffle sequence;
+  // the cost of the loads/stores is accounted for separately.
+  //
+ static const CostTblEntry AVX2InterleavedLoadTbl[] = {
+ { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
+ { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
+ { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
+    { 3, MVT::v16i8, 18 }, //(load 48i8 and) deinterleave into 3 x 16i8
+ { 3, MVT::v32i8, 42 }, //(load 96i8 and) deinterleave into 3 x 32i8
+
+ { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
+ { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
+ { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
+ { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
+ { 4, MVT::v32i8, 80 } //(load 128i8 and) deinterleave into 4 x 32i8
+ };
+
+ static const CostTblEntry AVX2InterleavedStoreTbl[] = {
+ { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
+ { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
+ { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
+ { 3, MVT::v16i8, 17 }, //interleave 3 x 16i8 into 48i8 (and store)
+ { 3, MVT::v32i8, 32 }, //interleave 3 x 32i8 into 96i8 (and store)
+
+ { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
+ { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
+ { 4, MVT::v8i8, 16 }, //interleave 4 x 8i8 into 32i8 (and store)
+ { 4, MVT::v16i8, 20 }, //interleave 4 x 16i8 into 64i8 (and store)
+ { 4, MVT::v32i8, 40 } //interleave 4 x 32i8 into 128i8 (and store)
+ };
+
+ if (Opcode == Instruction::Load) {
+ if (const auto *Entry =
+ CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
+ return NumOfMemOps * MemOpCost + Entry->Cost;
+ } else {
+ assert(Opcode == Instruction::Store &&
+ "Expected Store Instruction at this point");
+ if (const auto *Entry =
+ CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
+ return NumOfMemOps * MemOpCost + Entry->Cost;
+ }
+
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+}
+
// Get estimation for interleaved load/store operations and strided load.
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
@@ -2358,6 +2461,10 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI()))
return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace);
+ if (ST->hasAVX2())
+ return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace);
}
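
For the AVX2 path, the total returned by getInterleavedMemoryOpCostAVX2 is NumOfMemOps * MemOpCost plus the table's shuffle cost. A worked example under stated assumptions: loading <24 x i8> with Factor = 3 gives VF = 8, so the table key is {3, v8i8} with shuffle cost 9; assuming the legalized type is v32i8 and one legal vector load costs 1:

    #include <cstdio>

    int main() {
      // Assumptions: VecTy = <24 x i8>, Factor = 3, table key {3, v8i8};
      // v24i8 widens to v32i8; one legal vector load costs 1.
      unsigned VecTySize = 24;   // DL.getTypeStoreSize(<24 x i8>)
      unsigned LegalVTSize = 32; // store size of v32i8
      unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; // 1
      unsigned MemOpCost = 1;    // assumed getMemoryOpCost result
      unsigned ShuffleCost = 9;  // AVX2InterleavedLoadTbl entry {3, v8i8}
      std::printf("cost = %u\n", NumOfMemOps * MemOpCost + ShuffleCost); // 10
      return 0;
    }
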
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index 09ce2c90498d9..ad0a0a2113012 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -93,6 +93,9 @@ public:
int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace);
+ int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
+ unsigned Factor, ArrayRef<unsigned> Indices,
+ unsigned Alignment, unsigned AddressSpace);
int getIntImmCost(int64_t);
@@ -101,15 +104,13 @@ public:
int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
Type *Ty);
- bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
- TargetTransformInfo::LSRCost &C2);
bool isLegalMaskedLoad(Type *DataType);
bool isLegalMaskedStore(Type *DataType);
bool isLegalMaskedGather(Type *DataType);
bool isLegalMaskedScatter(Type *DataType);
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
-
+ bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize);
bool enableInterleavedAccessVectorization();
private:
int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,